-rw-r--r--  vp9/common/vp9_entropymv.c                 |  40
-rw-r--r--  vp9/common/vp9_entropymv.h                 |   3
-rw-r--r--  vp9/common/vp9_quant_common.c              |  31
-rw-r--r--  vp9/common/vp9_reconintra.h                |   5
-rw-r--r--  vp9/common/vp9_reconintra4x4.c             |  64
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh                |   2
-rw-r--r--  vp9/decoder/vp9_decodframe.c               | 151
-rw-r--r--  vp9/encoder/vp9_dct.c                      | 420
-rw-r--r--  vp9/encoder/vp9_encodeframe.c              |  10
-rw-r--r--  vp9/encoder/vp9_encodeintra.c              |   2
-rw-r--r--  vp9/encoder/vp9_firstpass.c                |  56
-rw-r--r--  vp9/encoder/vp9_mcomp.h                    |   2
-rw-r--r--  vp9/encoder/vp9_onyx_int.h                 |   5
-rw-r--r--  vp9/encoder/vp9_rdopt.c                    |   4
-rw-r--r--  vp9/encoder/x86/vp9_dct_sse2_intrinsics.c  | 626
15 files changed, 1106 insertions(+), 315 deletions(-)
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 9a7be4578..89dea4edc 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -42,9 +42,10 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
-MV_CLASS_2, -MV_CLASS_3,
10, 12,
-MV_CLASS_4, -MV_CLASS_5,
- 14, 16,
- -MV_CLASS_6, -MV_CLASS_7,
- -MV_CLASS_8, -MV_CLASS_9,
+ -MV_CLASS_6, 14,
+ 16, 18,
+ -MV_CLASS_7, -MV_CLASS_8,
+ -MV_CLASS_9, -MV_CLASS_10,
};
struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
@@ -64,24 +65,24 @@ const nmv_context vp9_default_nmv_context = {
{32, 64, 96},
{
{ /* vert component */
- 128, /* sign */
- {224, 144, 192, 168, 192, 176, 192, 198, 198}, /* class */
- {216}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
+ 128, /* sign */
+ {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */
+ {216}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
},
{ /* hor component */
- 128, /* sign */
- {216, 128, 176, 160, 176, 176, 192, 198, 198}, /* class */
- {208}, /* class0 */
- {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */
- {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
- {64, 96, 64}, /* fp */
- 160, /* class0_hp bit */
- 128, /* hp */
+ 128, /* sign */
+ {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */
+ {208}, /* class0 */
+ {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */
+ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */
+ {64, 96, 64}, /* fp */
+ 160, /* class0_hp bit */
+ 128, /* hp */
}
},
};
@@ -107,6 +108,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;
else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;
else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;
+ else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;
else assert(0);
if (offset)
*offset = z - mv_class_base(c);
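
For context, a minimal standalone sketch of the magnitude-to-class lookup that the new MV_CLASS_10 branch extends (z is a motion-vector component magnitude in 1/8-pel units; CLASS0_SIZE is assumed to be 2, as defined in vp9_entropymv.h):

  enum { SKETCH_CLASS0_SIZE = 2 };

  static int sketch_mv_class(int z) {
    int c;
    for (c = 0; c < 10; ++c)
      if (z < (SKETCH_CLASS0_SIZE * 8) << c)  /* class c covers z < CLASS0_SIZE * 8 * 2^c */
        return c;
    return 10;  /* MV_CLASS_10: (1024, 2048] integer pel; anything larger asserts above */
  }

Under that assumption, CLASS0_SIZE * 8192 is 16384 eighth-pel units, i.e. 2048 integer pels, the upper bound of the new class.
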
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 33500069e..162d2b44f 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -49,7 +49,7 @@ extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
/* Symbols for coding magnitude class of nonzero components */
-#define MV_CLASSES 10
+#define MV_CLASSES 11
typedef enum {
MV_CLASS_0 = 0, /* (0, 2] integer pel */
MV_CLASS_1 = 1, /* (2, 4] integer pel */
@@ -61,6 +61,7 @@ typedef enum {
MV_CLASS_7 = 7, /* (128, 256] integer pel */
MV_CLASS_8 = 8, /* (256, 512] integer pel */
MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024,2048] integer pel */
} MV_CLASS_TYPE;
extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 119038121..90eb88ed3 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -52,21 +52,6 @@ int vp9_dc_quant(int QIndex, int Delta) {
return retval;
}
-int vp9_dc2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = dc_qlookup[ QIndex ];
-
- return retval;
-
-}
int vp9_dc_uv_quant(int QIndex, int Delta) {
int retval;
@@ -94,22 +79,6 @@ int vp9_ac_yquant(int QIndex) {
return retval;
}
-int vp9_ac2quant(int QIndex, int Delta) {
- int retval;
-
- QIndex = QIndex + Delta;
-
- if (QIndex > MAXQ)
- QIndex = MAXQ;
- else if (QIndex < 0)
- QIndex = 0;
-
- retval = (ac_qlookup[ QIndex ] * 775) / 1000;
- if (retval < 4)
- retval = 4;
-
- return retval;
-}
int vp9_ac_uv_quant(int QIndex, int Delta) {
int retval;
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 3031fb699..b97b6089d 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -17,9 +17,10 @@
void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n);
+ int stride, int n,
+ int tx, int ty);
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
#if CONFIG_COMP_INTERINTRA_PRED
void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index 7fbee7c32..eab5ab495 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -15,17 +15,17 @@
#include "vp9_rtcd.h"
#if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int t,
+static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
int dx, int dy) {
int i, j;
int count = 0, gsum = 0, gdiv;
/* TODO: Make this code more efficient by breaking up into two loops */
- for (i = -t; i < n; ++i)
- for (j = -t; j < n; ++j) {
+ for (i = -ty; i < n; ++i)
+ for (j = -tx; j < n; ++j) {
int g;
if (i >= 0 && j >= 0) continue;
if (i + dy >= 0 && j + dx >= 0) continue;
- if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue;
+ if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
gsum += g * g;
count++;
@@ -36,14 +36,15 @@ static int find_grad_measure(uint8_t *x, int stride, int n, int t,
#if CONTEXT_PRED_REPLACEMENTS == 6
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imin = 1;
for (i = 2; i < 8; i += 1 + (i == 3))
imin = (g[i] < g[imin] ? i : imin);
@@ -73,12 +74,13 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#elif CONTEXT_PRED_REPLACEMENTS == 4
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imin = 1;
for (i = 3; i < 8; i+=2)
imin = (g[i] < g[imin] ? i : imin);
@@ -104,16 +106,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#elif CONTEXT_PRED_REPLACEMENTS == 0
B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
- int stride, int n) {
+ int stride, int n,
+ int tx, int ty) {
int g[8], i, imin, imax;
- g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0);
- g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1);
- g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1);
- g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2);
- g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1);
- g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
- g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
- g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+ g[0] = find_grad_measure(ptr, stride, n, tx, ty, 1, 0);
+ g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1);
+ g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1);
+ g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2);
+ g[4] = find_grad_measure(ptr, stride, n, tx, ty, 0, 1);
+ g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+ g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+ g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
imax = 0;
for (i = 1; i < 8; i++)
imax = (g[i] > g[imax] ? i : imax);
@@ -144,10 +147,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
}
#endif
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {
+ const int block_idx = x - xd->block;
+ const int have_top = (block_idx >> 2) || xd->up_available;
+ const int have_left = (block_idx & 3) || xd->left_available;
uint8_t *ptr = *(x->base_dst) + x->dst;
int stride = x->dst_stride;
- return vp9_find_dominant_direction(ptr, stride, 4);
+ int tx = have_left ? 4 : 0;
+ int ty = have_top ? 4 : 0;
+ if (!have_left && !have_top)
+ return B_DC_PRED;
+ return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
}
#endif
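
A hypothetical standalone form of the availability logic added to vp9_find_bpred_context() above: the 16 4x4 blocks of a macroblock are indexed in raster order, so row = block_idx >> 2 and column = block_idx & 3, and only blocks on the top row or left column depend on the macroblock-level up/left availability flags.

  static void sketch_bpred_availability(int block_idx,      /* 0..15 within the MB */
                                        int mb_up_available,
                                        int mb_left_available,
                                        int *have_top, int *have_left) {
    *have_top  = (block_idx >> 2) != 0 || mb_up_available;
    *have_left = (block_idx & 3)  != 0 || mb_left_available;
    /* When neither neighbour exists, the caller above falls back to B_DC_PRED. */
  }
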
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 3e941859e..0c2a5c94a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -598,7 +598,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int
specialize vp9_short_fdct32x32
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct16x16
+specialize vp9_short_fdct16x16 sse2
prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_walsh4x4
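
The extra "sse2" token lets run-time CPU detection route vp9_short_fdct16x16 to the SSE2 version added in this change. Roughly, and only as a sketch of what the generated vp9_rtcd() setup ends up doing (not the literal generated code):

  void (*vp9_short_fdct16x16)(int16_t *InputData, int16_t *OutputData, int pitch) =
      vp9_short_fdct16x16_c;

  static void sketch_rtcd_setup(int have_sse2) {
    if (have_sse2)
      vp9_short_fdct16x16 = vp9_short_fdct16x16_sse2;
  }
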
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 07cf227b8..d1d07c753 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -44,6 +44,21 @@
int dec_debug = 0;
#endif
+
+static int read_le16(const uint8_t *p) {
+ return (p[1] << 8) | p[0];
+}
+
+static int read_le32(const uint8_t *p) {
+ return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+}
+
+// len == 0 is not allowed
+static int read_is_valid(const unsigned char *start, size_t len,
+ const unsigned char *end) {
+ return start + len > start && start + len <= end;
+}
+
static int merge_index(int v, int n, int modulus) {
int max1 = (n - 1 - modulus / 2) / modulus + 1;
if (v < max1) v = v * modulus + modulus / 2;
@@ -61,14 +76,13 @@ static int merge_index(int v, int n, int modulus) {
static int inv_remap_prob(int v, int m) {
const int n = 256;
const int modulus = MODULUS_PARAM;
- int i;
+
v = merge_index(v, n - 1, modulus);
if ((m << 1) <= n) {
- i = vp9_inv_recenter_nonneg(v + 1, m);
+ return vp9_inv_recenter_nonneg(v + 1, m);
} else {
- i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+ return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
}
- return i;
}
static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
@@ -112,8 +126,8 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {
int i;
VP9_COMMON *const pc = &pbi->common;
- int segment_id = mb->mode_info_context->mbmi.segment_id;
- int qindex = get_qindex(mb, segment_id, pc->base_qindex);
+ const int segment_id = mb->mode_info_context->mbmi.segment_id;
+ const int qindex = get_qindex(mb, segment_id, pc->base_qindex);
mb->q_index = qindex;
for (i = 0; i < 16; i++)
@@ -287,12 +301,14 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
int ib = vp9_i8x8_block[i];
BLOCKD *b = &xd->block[ib];
int i8x8mode = b->bmi.as_mode.first;
+
b = &xd->block[16 + i];
- vp9_intra_uv4x4_predict(xd, &xd->block[16 + i], i8x8mode, b->predictor);
+ vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
+
b = &xd->block[20 + i];
- vp9_intra_uv4x4_predict(xd, &xd->block[20 + i], i8x8mode, b->predictor);
+ vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
*(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
}
@@ -361,7 +377,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
#if CONFIG_NEWBINTRAMODES
xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
- vp9_find_bpred_context(b);
+ vp9_find_bpred_context(xd, b);
#endif
if (!xd->mode_info_context->mbmi.mb_skip_coeff)
eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
@@ -798,9 +814,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
if (xd->mode_info_context->mbmi.mb_skip_coeff) {
vp9_reset_mb_tokens_context(xd);
} else if (!bool_error(bc)) {
- if (mode != B_PRED) {
+ if (mode != B_PRED)
eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
- }
}
//mode = xd->mode_info_context->mbmi.mode;
@@ -923,10 +938,9 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
xd->above_context = cm->above_context + mb_col;
xd->left_context = cm->left_context + (mb_row & 3);
- /* Distance of Mb to the various image edges.
- * These are specified to 8th pel as they are always compared to
- * values that are in 1/8th pel units
- */
+ // Distance of Mb to the various image edges.
+ // These are specified to 8th pel as they are always compared to
+ // values that are in 1/8th pel units
block_size >>= 4; // in mb units
set_mb_row(cm, xd, mb_row, block_size);
@@ -937,37 +951,31 @@ static void set_offsets(VP9D_COMP *pbi, int block_size,
xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
}
-static void set_refs(VP9D_COMP *pbi, int block_size,
- int mb_row, int mb_col) {
+static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
if (mbmi->ref_frame > INTRA_FRAME) {
- int ref_fb_idx;
-
- /* Select the appropriate reference frame for this MB */
- ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
+ // Select the appropriate reference frame for this MB
+ int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
&xd->scale_factor[0], &xd->scale_factor_uv[0]);
- /* propagate errors from reference frames */
+ // propagate errors from reference frames
xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
if (mbmi->second_ref_frame > INTRA_FRAME) {
- int second_ref_fb_idx;
-
- /* Select the appropriate reference frame for this MB */
- second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
+ // Select the appropriate reference frame for this MB
+ int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
mb_row, mb_col,
&xd->scale_factor[1], &xd->scale_factor_uv[1]);
- /* propagate errors from reference frames */
+ // propagate errors from reference frames
xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
}
}
@@ -1054,15 +1062,6 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
}
}
-static unsigned int read_partition_size(const unsigned char *cx_size) {
- return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
-}
-
-static int read_is_valid(const unsigned char *start, size_t len,
- const unsigned char *end) {
- return start + len > start && start + len <= end;
-}
-
static void setup_token_decoder(VP9D_COMP *pbi,
const unsigned char *cx_data,
@@ -1090,7 +1089,7 @@ static void setup_token_decoder(VP9D_COMP *pbi,
static void init_frame(VP9D_COMP *pbi) {
VP9_COMMON *const pc = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+ MACROBLOCKD *const xd = &pbi->mb;
if (pc->frame_type == KEY_FRAME) {
vp9_setup_past_independence(pc, xd);
@@ -1113,7 +1112,6 @@ static void init_frame(VP9D_COMP *pbi) {
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_stride = pc->mode_info_stride;
xd->corrupted = 0;
-
xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff;
}
@@ -1241,8 +1239,8 @@ static void update_frame_size(VP9D_COMP *pbi) {
VP9_COMMON *cm = &pbi->common;
/* our internal buffers are always multiples of 16 */
- int width = (cm->Width + 15) & ~15;
- int height = (cm->Height + 15) & ~15;
+ const int width = (cm->Width + 15) & ~15;
+ const int height = (cm->Height + 15) & ~15;
cm->mb_rows = height >> 4;
cm->mb_cols = width >> 4;
@@ -1261,8 +1259,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
BOOL_DECODER header_bc, residual_bc;
VP9_COMMON *const pc = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const unsigned char *data = (const unsigned char *)pbi->Source;
- const unsigned char *data_end = data + pbi->source_sz;
+ const uint8_t *data = (const uint8_t *)pbi->Source;
+ const uint8_t *data_end = data + pbi->source_sz;
ptrdiff_t first_partition_length_in_bytes = 0;
int mb_row;
@@ -1284,8 +1282,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
first_partition_length_in_bytes =
(data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
- if ((data + first_partition_length_in_bytes > data_end
- || data + first_partition_length_in_bytes < data))
+ if (!read_is_valid(data, first_partition_length_in_bytes, data_end))
vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt partition 0 length");
@@ -1314,8 +1311,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
* size.
*/
if (data + 5 < data_end) {
- pc->Width = (data[0] | (data[1] << 8));
- pc->Height = (data[2] | (data[3] << 8));
+ pc->Width = read_le16(data);
+ pc->Height = read_le16(data + 2);
pc->horiz_scale = data[4] >> 4;
pc->vert_scale = data[4] & 0x0F;
@@ -1467,7 +1464,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->ref_pred_probs[0] = 120;
pc->ref_pred_probs[1] = 80;
pc->ref_pred_probs[2] = 40;
-
} else {
for (i = 0; i < PREDICTION_PROBS; i++) {
if (vp9_read_bit(&header_bc))
@@ -1481,10 +1477,11 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
if (xd->lossless) {
pc->txfm_mode = ONLY_4X4;
} else {
- /* Read the loop filter level and type */
+ // Read the loop filter level and type
pc->txfm_mode = vp9_read_literal(&header_bc, 2);
if (pc->txfm_mode == 3)
pc->txfm_mode += vp9_read_bit(&header_bc);
+
if (pc->txfm_mode == TX_MODE_SELECT) {
pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
@@ -1511,7 +1508,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];
}
}
@@ -1522,7 +1519,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);
if (vp9_read_bit(&header_bc)) /* Apply sign */
- xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];
}
}
}
@@ -1570,14 +1567,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
- /* Is high precision mv allowed */
+ // Is high precision mv allowed
xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);
+
// Read the type of subpel filter to use
- if (vp9_read_bit(&header_bc)) {
- pc->mcomp_filter_type = SWITCHABLE;
- } else {
- pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);
- }
+ pc->mcomp_filter_type = vp9_read_bit(&header_bc) ? SWITCHABLE :
+ vp9_read_literal(&header_bc, 2);
+
#if CONFIG_COMP_INTERINTRA_PRED
pc->use_interintra = vp9_read_bit(&header_bc);
#endif
@@ -1731,7 +1727,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
/* tile info */
{
- const unsigned char *data_ptr = data + first_partition_length_in_bytes;
+ const uint8_t *data_ptr = data + first_partition_length_in_bytes;
int tile_row, tile_col, delta_log2_tiles;
vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
@@ -1753,26 +1749,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
if (pbi->oxcf.inv_tile_order) {
const int n_cols = pc->tile_columns;
- const unsigned char *data_ptr2[4][1 << 6];
+ const uint8_t *data_ptr2[4][1 << 6];
BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);
// pre-initialize the offsets, we're going to read in inverse order
data_ptr2[0][0] = data_ptr;
for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
if (tile_row) {
- int size = data_ptr2[tile_row - 1][n_cols - 1][0] |
- (data_ptr2[tile_row - 1][n_cols - 1][1] << 8) |
- (data_ptr2[tile_row - 1][n_cols - 1][2] << 16) |
- (data_ptr2[tile_row - 1][n_cols - 1][3] << 24);
+ const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]);
data_ptr2[tile_row - 1][n_cols - 1] += 4;
data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
}
for (tile_col = 1; tile_col < n_cols; tile_col++) {
- int size = data_ptr2[tile_row][tile_col - 1][0] |
- (data_ptr2[tile_row][tile_col - 1][1] << 8) |
- (data_ptr2[tile_row][tile_col - 1][2] << 16) |
- (data_ptr2[tile_row][tile_col - 1][3] << 24);
+ const int size = read_le32(data_ptr2[tile_row][tile_col - 1]);
data_ptr2[tile_row][tile_col - 1] += 4;
data_ptr2[tile_row][tile_col] =
data_ptr2[tile_row][tile_col - 1] + size;
@@ -1813,10 +1803,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
}
if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
- int size = data_ptr[0] |
- (data_ptr[1] << 8) |
- (data_ptr[2] << 16) |
- (data_ptr[3] << 24);
+ int size = read_le32(data_ptr);
data_ptr += 4 + size;
}
}
@@ -1829,31 +1816,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
pc->last_width = pc->Width;
pc->last_height = pc->Height;
- /* Collect information about decoder corruption. */
- /* 1. Check first boolean decoder for errors. */
- pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
- /* 2. Check the macroblock information */
- pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;
+ // Collect information about decoder corruption.
+ // 1. Check first boolean decoder for errors.
+ // 2. Check the macroblock information
+ pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) |
+ corrupt_tokens;
if (!pbi->decoded_key_frame) {
- if (pc->frame_type == KEY_FRAME &&
- !pc->yv12_fb[pc->new_fb_idx].corrupted)
+ if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)
pbi->decoded_key_frame = 1;
else
vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
"A stream must start with a complete key frame");
}
- if (!pc->error_resilient_mode &&
- !pc->frame_parallel_decoding_mode) {
+ if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(pc);
#if CONFIG_CODE_NONZEROCOUNT
vp9_adapt_nzc_probs(pc);
#endif
}
+
if (pc->frame_type != KEY_FRAME) {
- if (!pc->error_resilient_mode &&
- !pc->frame_parallel_decoding_mode) {
+ if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(pc);
vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
vp9_adapt_mode_context(&pbi->common);
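
A note on the byte-stream helpers introduced at the top of this file: partition and tile sizes are little-endian byte sequences, and read_is_valid() rejects both zero-length and out-of-range reads. A hypothetical standalone version of the partition-0 length check (the >> 5 skips the low header bits of the 3-byte field, which this change leaves untouched):

  #include <stddef.h>
  #include <stdint.h>

  static int sketch_partition0_ok(const uint8_t *data, const uint8_t *data_end) {
    const size_t len = (size_t)((data[0] | (data[1] << 8) | (data[2] << 16)) >> 5);
    /* Same idea as read_is_valid(): non-empty and entirely inside the buffer. */
    return len > 0 && data + len <= data_end;
  }
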
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 6c0ab863a..6365ed9a2 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -269,6 +269,185 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {
}
}
+void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ const int stride = pitch >> 1;
+ int pass;
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[256];
+ int16_t *in = input;
+ int16_t *out = intermediate;
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ /*canbe16*/ int step1[8];
+ /*canbe16*/ int step2[8];
+ /*canbe16*/ int step3[8];
+ /*canbe16*/ int input[8];
+ /*needs32*/ int temp1, temp2;
+ int i;
+ for (i = 0; i < 16; i++) {
+ if (0 == pass) {
+ // Calculate input for the first 8 results.
+ input[0] = (in[0 * stride] + in[15 * stride]) << 2;
+ input[1] = (in[1 * stride] + in[14 * stride]) << 2;
+ input[2] = (in[2 * stride] + in[13 * stride]) << 2;
+ input[3] = (in[3 * stride] + in[12 * stride]) << 2;
+ input[4] = (in[4 * stride] + in[11 * stride]) << 2;
+ input[5] = (in[5 * stride] + in[10 * stride]) << 2;
+ input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;
+ input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;
+ // Calculate input for the next 8 results.
+ step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;
+ step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;
+ step1[2] = (in[5 * stride] - in[10 * stride]) << 2;
+ step1[3] = (in[4 * stride] - in[11 * stride]) << 2;
+ step1[4] = (in[3 * stride] - in[12 * stride]) << 2;
+ step1[5] = (in[2 * stride] - in[13 * stride]) << 2;
+ step1[6] = (in[1 * stride] - in[14 * stride]) << 2;
+ step1[7] = (in[0 * stride] - in[15 * stride]) << 2;
+ } else {
+ // Calculate input for the first 8 results.
+ input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
+ input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
+ input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
+ input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
+ input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
+ input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
+ input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
+ input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
+ // Calculate input for the next 8 results.
+ step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
+ step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
+ step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
+ step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
+ step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
+ step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
+ step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
+ step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+ /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+ /*needs32*/ int t0, t1, t2, t3;
+ /*canbe16*/ int x0, x1, x2, x3;
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4_1d(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = dct_const_round_shift(t0);
+ out[4] = dct_const_round_shift(t2);
+ out[8] = dct_const_round_shift(t1);
+ out[12] = dct_const_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = dct_const_round_shift(t0);
+ t3 = dct_const_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = dct_const_round_shift(t0);
+ out[6] = dct_const_round_shift(t2);
+ out[10] = dct_const_round_shift(t1);
+ out[14] = dct_const_round_shift(t3);
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = dct_const_round_shift(temp1);
+ step2[3] = dct_const_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = dct_const_round_shift(temp1);
+ step2[5] = dct_const_round_shift(temp2);
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ step2[1] = dct_const_round_shift(temp1);
+ step2[2] = dct_const_round_shift(temp2);
+ temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = dct_const_round_shift(temp1);
+ step2[6] = dct_const_round_shift(temp2);
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] - step2[2];
+ step1[3] = step3[3] + step2[2];
+ step1[4] = step3[4] + step2[5];
+ step1[5] = step3[4] - step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = dct_const_round_shift(temp1);
+ out[9] = dct_const_round_shift(temp2);
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = dct_const_round_shift(temp1);
+ out[13] = dct_const_round_shift(temp2);
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = dct_const_round_shift(temp1);
+ out[11] = dct_const_round_shift(temp2);
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = dct_const_round_shift(temp1);
+ out[15] = dct_const_round_shift(temp2);
+ }
+ // Do next column (which is a transposed row in second/horizontal pass)
+ in++;
+ out += 16;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+}
+
static void fadst8_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -421,132 +600,145 @@ void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
// Rewrote to use same algorithm as others.
-static void fdct16_1d(int16_t input[16], int16_t output[16]) {
- int16_t step[16];
- int temp1, temp2;
+static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+ /*canbe16*/ int step1[8];
+ /*canbe16*/ int step2[8];
+ /*canbe16*/ int step3[8];
+ /*canbe16*/ int input[8];
+ /*needs32*/ int temp1, temp2;
// step 1
- step[ 0] = input[0] + input[15];
- step[ 1] = input[1] + input[14];
- step[ 2] = input[2] + input[13];
- step[ 3] = input[3] + input[12];
- step[ 4] = input[4] + input[11];
- step[ 5] = input[5] + input[10];
- step[ 6] = input[6] + input[ 9];
- step[ 7] = input[7] + input[ 8];
- step[ 8] = input[7] - input[ 8];
- step[ 9] = input[6] - input[ 9];
- step[10] = input[5] - input[10];
- step[11] = input[4] - input[11];
- step[12] = input[3] - input[12];
- step[13] = input[2] - input[13];
- step[14] = input[1] - input[14];
- step[15] = input[0] - input[15];
-
- fdct8_1d(step, step);
+ input[0] = in[0] + in[15];
+ input[1] = in[1] + in[14];
+ input[2] = in[2] + in[13];
+ input[3] = in[3] + in[12];
+ input[4] = in[4] + in[11];
+ input[5] = in[5] + in[10];
+ input[6] = in[6] + in[ 9];
+ input[7] = in[7] + in[ 8];
+
+ step1[0] = in[7] - in[ 8];
+ step1[1] = in[6] - in[ 9];
+ step1[2] = in[5] - in[10];
+ step1[3] = in[4] - in[11];
+ step1[4] = in[3] - in[12];
+ step1[5] = in[2] - in[13];
+ step1[6] = in[1] - in[14];
+ step1[7] = in[0] - in[15];
+
+ // fdct8_1d(step, step);
+ {
+ /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
+ /*needs32*/ int t0, t1, t2, t3;
+ /*canbe16*/ int x0, x1, x2, x3;
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4_1d(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = dct_const_round_shift(t0);
+ out[4] = dct_const_round_shift(t2);
+ out[8] = dct_const_round_shift(t1);
+ out[12] = dct_const_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = dct_const_round_shift(t0);
+ t3 = dct_const_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = dct_const_round_shift(t0);
+ out[6] = dct_const_round_shift(t2);
+ out[10] = dct_const_round_shift(t1);
+ out[14] = dct_const_round_shift(t3);
+ }
// step 2
- output[8] = step[8];
- output[9] = step[9];
- temp1 = (-step[10] + step[13]) * cospi_16_64;
- temp2 = (-step[11] + step[12]) * cospi_16_64;
- output[10] = dct_const_round_shift(temp1);
- output[11] = dct_const_round_shift(temp2);
- temp1 = (step[11] + step[12]) * cospi_16_64;
- temp2 = (step[10] + step[13]) * cospi_16_64;
- output[12] = dct_const_round_shift(temp1);
- output[13] = dct_const_round_shift(temp2);
- output[14] = step[14];
- output[15] = step[15];
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = dct_const_round_shift(temp1);
+ step2[3] = dct_const_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = dct_const_round_shift(temp1);
+ step2[5] = dct_const_round_shift(temp2);
// step 3
- step[ 8] = output[8] + output[11];
- step[ 9] = output[9] + output[10];
- step[ 10] = output[9] - output[10];
- step[ 11] = output[8] - output[11];
- step[ 12] = -output[12] + output[15];
- step[ 13] = -output[13] + output[14];
- step[ 14] = output[13] + output[14];
- step[ 15] = output[12] + output[15];
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
// step 4
- output[8] = step[8];
- temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64;
- temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64;
- output[9] = dct_const_round_shift(temp1);
- output[10] = dct_const_round_shift(temp2);
- output[11] = step[11];
- output[12] = step[12];
- temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64;
- temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64;
- output[13] = dct_const_round_shift(temp1);
- output[14] = dct_const_round_shift(temp2);
- output[15] = step[15];
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ step2[1] = dct_const_round_shift(temp1);
+ step2[2] = dct_const_round_shift(temp2);
+ temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = dct_const_round_shift(temp1);
+ step2[6] = dct_const_round_shift(temp2);
// step 5
- step[8] = output[8] + output[9];
- step[9] = output[8] - output[9];
- step[10] = -output[10] + output[11];
- step[11] = output[10] + output[11];
- step[12] = output[12] + output[13];
- step[13] = output[12] - output[13];
- step[14] = -output[14] + output[15];
- step[15] = output[14] + output[15];
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] - step2[2];
+ step1[3] = step3[3] + step2[2];
+ step1[4] = step3[4] + step2[5];
+ step1[5] = step3[4] - step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
// step 6
- output[0] = step[0];
- output[8] = step[4];
- output[4] = step[2];
- output[12] = step[6];
- output[2] = step[1];
- output[10] = step[5];
- output[6] = step[3];
- output[14] = step[7];
-
- temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64;
- temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64;
- output[1] = dct_const_round_shift(temp1);
- output[9] = dct_const_round_shift(temp2);
-
- temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64;
- temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64;
- output[5] = dct_const_round_shift(temp1);
- output[13] = dct_const_round_shift(temp2);
-
- temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64;
- temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64;
- output[3] = dct_const_round_shift(temp1);
- output[11] = dct_const_round_shift(temp2);
-
- temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64;
- temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64;
- output[7] = dct_const_round_shift(temp1);
- output[15] = dct_const_round_shift(temp2);
-}
-
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
- int i, j;
- int16_t output[256];
- int16_t temp_in[16], temp_out[16];
-
- // First transform columns
- for (i = 0; i < 16; i++) {
- for (j = 0; j < 16; j++)
- temp_in[j] = input[j * shortpitch + i] << 2;
- fdct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; j++)
- output[j * 16 + i] = (temp_out[j] + 1) >> 2;
- }
-
- // Then transform rows
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j)
- temp_in[j] = output[j + i * 16];
- fdct16_1d(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- out[j + i * 16] = temp_out[j];
- }
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = dct_const_round_shift(temp1);
+ out[9] = dct_const_round_shift(temp2);
+
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = dct_const_round_shift(temp1);
+ out[13] = dct_const_round_shift(temp2);
+
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = dct_const_round_shift(temp1);
+ out[11] = dct_const_round_shift(temp2);
+
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = dct_const_round_shift(temp1);
+ out[15] = dct_const_round_shift(temp2);
}
void fadst16_1d(int16_t *input, int16_t *output) {
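
Both 16x16 paths in this file follow the same fixed-point plan: pass 1 scales the input up by 4 (<< 2) for precision and writes its results transposed into the intermediate buffer, and pass 2 scales back down with (x + 1) >> 2 before transforming the transposed rows. The rounding primitive they share, sketched standalone below, assumes DCT_CONST_BITS is 14 and that the cospi_* values are the usual 14-bit fixed-point cosine constants (e.g. cospi_16_64 is 16384 * cos(pi/4), rounded), as defined elsewhere in the codec:

  #define SKETCH_DCT_CONST_BITS     14
  #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

  /* Round a 32-bit product of a sample and a cospi_* constant back down to
     coefficient scale. */
  static int sketch_dct_const_round_shift(int input) {
    return (input + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS;
  }
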
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 8922661fe..428e585e1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -594,9 +594,6 @@ static void update_state(VP9_COMP *cpi,
[vp9_switchable_interp_map[mbmi->interp_filter]];
}
- cpi->prediction_error += ctx->distortion;
- cpi->intra_error += ctx->intra_error;
-
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
@@ -1265,8 +1262,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// Reset frame count of inter 0,0 motion vector usage.
cpi->inter_zz_count = 0;
- cpi->prediction_error = 0;
- cpi->intra_error = 0;
cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
@@ -1292,8 +1287,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cpi->mb_mv_ref_count);
#endif
-
- // force lossless mode when Q0 is selected
+ // force lossless mode
+ if (cm->base_qindex <= 4)
+ cm->base_qindex = 0;
cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
cm->y1dc_delta_q == 0 &&
cm->uvdc_delta_q == 0 &&
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 3c98d4aa6..9e5bcea16 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -44,7 +44,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
TX_TYPE tx_type;
#if CONFIG_NEWBINTRAMODES
- b->bmi.as_mode.context = vp9_find_bpred_context(b);
+ b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
#endif
vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5c2067b00..d646fe222 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -859,6 +859,8 @@ static double calc_correction_factor(double err_per_mb,
power_term = (power_term > pt_high) ? pt_high : power_term;
// Calculate correction factor
+ if (power_term < 1.0)
+ assert(error_term >= 0.0);
correction_factor = pow(error_term, power_term);
// Clip range
@@ -920,15 +922,19 @@ static int estimate_max_q(VP9_COMP *cpi,
// Look at the drop in prediction quality between the last frame
// and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
+ if (fpstats->sr_coded_error > fpstats->coded_error) {
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+ } else {
sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
+ }
// Calculate a corrective factor based on a rolling ratio of bits spent
// vs target bits
@@ -1031,15 +1037,19 @@ static int estimate_cq(VP9_COMP *cpi,
// Look at the drop in prediction quality between the last frame
// and the GF buffer (which contained an older frame).
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
+ if (fpstats->sr_coded_error > fpstats->coded_error) {
+ sr_err_diff =
+ (fpstats->sr_coded_error - fpstats->coded_error) /
+ (fpstats->count * cpi->common.MBs);
+ sr_correction = (sr_err_diff / 32.0);
+ sr_correction = pow(sr_correction, 0.25);
+ if (sr_correction < 0.75)
+ sr_correction = 0.75;
+ else if (sr_correction > 1.25)
+ sr_correction = 1.25;
+ } else {
sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
+ }
// II ratio correction factor for clip as a whole
clip_iiratio = cpi->twopass.total_stats->intra_error /
@@ -1178,12 +1188,16 @@ static double get_prediction_decay_rate(VP9_COMP *cpi,
mb_sr_err_diff =
(next_frame->sr_coded_error - next_frame->coded_error) /
(cpi->common.MBs);
- second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
- second_ref_decay = pow(second_ref_decay, 0.5);
- if (second_ref_decay < 0.85)
+ if (mb_sr_err_diff <= 512.0) {
+ second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+ second_ref_decay = pow(second_ref_decay, 0.5);
+ if (second_ref_decay < 0.85)
+ second_ref_decay = 0.85;
+ else if (second_ref_decay > 1.0)
+ second_ref_decay = 1.0;
+ } else {
second_ref_decay = 0.85;
- else if (second_ref_decay > 1.0)
- second_ref_decay = 1.0;
+ }
if (second_ref_decay < prediction_decay_rate)
prediction_decay_rate = second_ref_decay;
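
The three hunks above share one pattern: pow() is only applied when its base is non-negative, and the correction is held inside its clip range. A hypothetical standalone form of the second-reference correction used in estimate_max_q()/estimate_cq():

  #include <math.h>

  static double sketch_sr_correction(double sr_coded_error, double coded_error,
                                     double count, double mbs) {
    double c = 0.75;
    if (sr_coded_error > coded_error) {
      const double sr_err_diff = (sr_coded_error - coded_error) / (count * mbs);
      c = pow(sr_err_diff / 32.0, 0.25);
      if (c < 0.75)
        c = 0.75;
      else if (c > 1.25)
        c = 1.25;
    }
    return c;
  }
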
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index d5c7032a9..fd1bb2b4e 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -21,7 +21,7 @@ void print_mode_context(VP9_COMMON *pc);
// The maximum number of steps in a step search given the largest
// allowed initial step
-#define MAX_MVSEARCH_STEPS 10
+#define MAX_MVSEARCH_STEPS 11
// Max full pel mv specified in 1 pel units
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
// Maximum size of the first step in full pel units
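
With MAX_MVSEARCH_STEPS raised to 11, MAX_FULL_PEL_VAL works out to (1 << 11) - 1 = 2047 full pels, which lines up with the new (1024, 2048] integer-pel MV_CLASS_10 range added in vp9_entropymv.h.
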
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 300e12869..2e9f96c09 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -390,11 +390,6 @@ typedef struct VP9_COMP {
CODING_CONTEXT coding_context;
// Rate targetting variables
- int64_t prediction_error;
- int64_t last_prediction_error;
- int64_t intra_error;
- int64_t last_intra_error;
-
int this_frame_target;
int projected_frame_size;
int last_q[2]; // Separate values for Intra/Inter
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3004d6bf3..d21224318 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1168,7 +1168,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
#if CONFIG_NEWBINTRAMODES
- b->bmi.as_mode.context = vp9_find_bpred_context(b);
+ b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
#endif
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
@@ -1279,7 +1279,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
bmode_costs = mb->bmode_costs[A][L];
}
#if CONFIG_NEWBINTRAMODES
- mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i);
+ mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
#endif
total_rd += rd_pick_intra4x4block(
diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
index ff884d999..28c4c754e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
+++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c
@@ -270,3 +270,629 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
_mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7);
}
}
+
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ const int stride = pitch >> 1;
+ int pass;
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[256];
+ int16_t *in = input;
+ int16_t *out = intermediate;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = _mm_add_epi16(in00, in15);
+ input1 = _mm_add_epi16(in01, in14);
+ input2 = _mm_add_epi16(in02, in13);
+ input3 = _mm_add_epi16(in03, in12);
+ input4 = _mm_add_epi16(in04, in11);
+ input5 = _mm_add_epi16(in05, in10);
+ input6 = _mm_add_epi16(in06, in09);
+ input7 = _mm_add_epi16(in07, in08);
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = _mm_sub_epi16(in07, in08);
+ step1_1 = _mm_sub_epi16(in06, in09);
+ step1_2 = _mm_sub_epi16(in05, in10);
+ step1_3 = _mm_sub_epi16(in04, in11);
+ step1_4 = _mm_sub_epi16(in03, in12);
+ step1_5 = _mm_sub_epi16(in02, in13);
+ step1_6 = _mm_sub_epi16(in01, in14);
+ step1_7 = _mm_sub_epi16(in00, in15);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(input0, input7);
+ const __m128i q1 = _mm_add_epi16(input1, input6);
+ const __m128i q2 = _mm_add_epi16(input2, input5);
+ const __m128i q3 = _mm_add_epi16(input3, input4);
+ const __m128i q4 = _mm_sub_epi16(input3, input4);
+ const __m128i q5 = _mm_sub_epi16(input2, input5);
+ const __m128i q6 = _mm_sub_epi16(input1, input6);
+ const __m128i q7 = _mm_sub_epi16(input0, input7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res00 = _mm_packs_epi32(w0, w1);
+ res08 = _mm_packs_epi32(w2, w3);
+ res04 = _mm_packs_epi32(w4, w5);
+ res12 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res02 = _mm_packs_epi32(w0, w1);
+ res14 = _mm_packs_epi32(w2, w3);
+ res10 = _mm_packs_epi32(w4, w5);
+ res06 = _mm_packs_epi32(w6, w7);
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_2 = _mm_packs_epi32(w0, w1);
+ step2_3 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_5 = _mm_packs_epi32(w0, w1);
+ step2_4 = _mm_packs_epi32(w2, w3);
+ }
+ // step 3
+ {
+ step3_0 = _mm_add_epi16(step1_0, step2_3);
+ step3_1 = _mm_add_epi16(step1_1, step2_2);
+ step3_2 = _mm_sub_epi16(step1_1, step2_2);
+ step3_3 = _mm_sub_epi16(step1_0, step2_3);
+ step3_4 = _mm_sub_epi16(step1_7, step2_4);
+ step3_5 = _mm_sub_epi16(step1_6, step2_5);
+ step3_6 = _mm_add_epi16(step1_6, step2_5);
+ step3_7 = _mm_add_epi16(step1_7, step2_4);
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_1 = _mm_packs_epi32(w0, w1);
+ step2_2 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_6 = _mm_packs_epi32(w0, w1);
+ step2_5 = _mm_packs_epi32(w2, w3);
+ }
+ // step 5
+ {
+ step1_0 = _mm_add_epi16(step3_0, step2_1);
+ step1_1 = _mm_sub_epi16(step3_0, step2_1);
+ step1_2 = _mm_sub_epi16(step3_3, step2_2);
+ step1_3 = _mm_add_epi16(step3_3, step2_2);
+ step1_4 = _mm_add_epi16(step3_4, step2_5);
+ step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_6 = _mm_sub_epi16(step3_7, step2_6);
+ step1_7 = _mm_add_epi16(step3_7, step2_6);
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res01 = _mm_packs_epi32(w0, w1);
+ res09 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res05 = _mm_packs_epi32(w0, w1);
+ res13 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res11 = _mm_packs_epi32(w0, w1);
+ res03 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res15 = _mm_packs_epi32(w0, w1);
+ res07 = _mm_packs_epi32(w2, w3);
+ }
+ }
+ // Transpose the results; do it as two 8x8 transposes.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
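+ // Store results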
+ _mm_storeu_si128 ((__m128i *)(out + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 7 * 16), tr2_7);
+ }
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ // Store results
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_storeu_si128 ((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ }
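+ // Advance past the 8 rows of results just written (8 rows * 16 coefficients).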
+ out += 8*16;
+ }
+ // Set up in/out for the next pass.
+ in = intermediate;
+ out = output;
+ }
+}