summaryrefslogtreecommitdiff
path: root/vp9
diff options
context:
space:
mode:
Diffstat (limited to 'vp9')
-rw-r--r--vp9/common/vp9_blockd.h4
-rw-r--r--vp9/common/vp9_common_data.c2
-rw-r--r--vp9/common/vp9_common_data.h1
-rw-r--r--vp9/common/vp9_onyxc_int.h2
-rw-r--r--vp9/common/vp9_reconinter.h32
-rw-r--r--vp9/common/x86/vp9_asm_stubs.c100
-rw-r--r--vp9/common/x86/vp9_idct_intrin_sse2.c286
-rw-r--r--vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c591
-rw-r--r--vp9/encoder/vp9_encodeframe.c3
-rw-r--r--vp9/encoder/vp9_firstpass.c76
-rw-r--r--vp9/encoder/vp9_firstpass.h1
-rw-r--r--vp9/encoder/vp9_onyx_if.c554
-rw-r--r--vp9/encoder/vp9_onyx_int.h143
-rw-r--r--vp9/encoder/vp9_ratectrl.c143
-rw-r--r--vp9/encoder/vp9_rdopt.c89
-rw-r--r--vp9/encoder/vp9_rdopt.h9
-rw-r--r--vp9/vp9_common.mk1
-rw-r--r--vp9/vp9_dx_iface.c125
18 files changed, 768 insertions, 1394 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c6af7f6da..21e2b16a4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -115,10 +115,6 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
return mi_width_log2_lookup[sb_type];
}
-static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
- return mi_height_log2_lookup[sb_type];
-}
-
// This structure now relates to 8x8 block regions.
typedef struct {
MB_PREDICTION_MODE mode, uv_mode;
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 886c0afc6..a927823e0 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] =
{0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
-const int mi_height_log2_lookup[BLOCK_SIZES] =
- {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
const int num_8x8_blocks_high_lookup[BLOCK_SIZES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index a367c65c6..5222d29c1 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -16,7 +16,6 @@
extern const int b_width_log2_lookup[BLOCK_SIZES];
extern const int b_height_log2_lookup[BLOCK_SIZES];
extern const int mi_width_log2_lookup[BLOCK_SIZES];
-extern const int mi_height_log2_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES];
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index b5ed959e1..f6fe4d3f1 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -346,7 +346,7 @@ static INLINE int partition_plane_context(
const int bs = 1 << bsl;
int above = 0, left = 0, i;
- assert(mi_width_log2(bsize) == mi_height_log2(bsize));
+ assert(b_width_log2(bsize) == b_height_log2(bsize));
assert(bsl >= 0);
for (i = 0; i < bs; i++) {
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 9f379399c..3cc16d94e 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -58,34 +58,34 @@ static void setup_pred_plane(struct buf_2d *dst,
static void setup_dst_planes(MACROBLOCKD *xd,
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
- uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- struct macroblockd_plane *pd = &xd->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
pd->subsampling_x, pd->subsampling_y);
}
}
-static void setup_pre_planes(MACROBLOCKD *xd, int i,
+static void setup_pre_planes(MACROBLOCKD *xd, int idx,
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col,
const struct scale_factors *sf) {
- if (src) {
- int j;
- uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ if (src != NULL) {
+ int i;
+ uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+ src->alpha_buffer};
+ const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+ src->alpha_stride};
- for (j = 0; j < MAX_MB_PLANE; ++j) {
- struct macroblockd_plane *pd = &xd->plane[j];
- setup_pred_plane(&pd->pre[i], buffers[j], strides[j],
- mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
+ sf, pd->subsampling_x, pd->subsampling_y);
}
}
}
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f4f758297..f95423678 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,105 +23,20 @@ typedef void filter8_1dfunction (
const short *filter
);
-#if (HAVE_SSSE3)
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Ensure the filter can be compressed to int16_t. */
- if (x_step_q4 == 16 && filter_x[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (y_step_q4 == 16 && filter_y[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
w, h);
}
}
-#endif
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 501bed5a8..2f6149464 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -380,17 +380,13 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
\
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \
- in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \
}
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
@@ -662,7 +658,6 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
}
static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
- const __m128i zero = _mm_setzero_si128();
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
@@ -677,7 +672,6 @@ static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
- out[4] = out[5] = out[6] = out[7] = zero;
}
static void idct8_1d_sse2(__m128i *in) {
@@ -1270,6 +1264,114 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
stp2_10, stp2_13, stp2_11, stp2_12) \
}
+#define IDCT16_10_1D \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
+ stg2_0, stg2_1, stg2_6, stg2_7, \
+ stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
+ stg3_0, stg3_1, \
+ stp2_4, stp2_7) \
+ \
+ stp1_9 = stp1_8_0; \
+ stp1_10 = stp1_11; \
+ \
+ stp1_13 = stp1_12_0; \
+ stp1_14 = stp1_15; \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
+ stg4_0, stg4_1, \
+ stp1_0, stp1_1) \
+ stp2_5 = stp2_4; \
+ stp2_6 = stp2_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_2 = stp1_1; \
+ stp1_3 = stp1_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
+
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2433,22 +2535,14 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -2456,119 +2550,72 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
__m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_8_0, stp1_12_0;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
- in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
- // 1-D idct. Load input data.
+ // First 1-D inverse DCT
+ // Load input data.
in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
- TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
// Stage2
{
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
- const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
- const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
- tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
- tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
- tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
tmp0 = _mm_add_epi32(tmp0, rounding);
tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
tmp5 = _mm_add_epi32(tmp5, rounding);
tmp7 = _mm_add_epi32(tmp7, rounding);
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
- stp2_8 = _mm_packs_epi32(tmp0, zero);
- stp2_15 = _mm_packs_epi32(tmp2, zero);
- stp2_9 = _mm_packs_epi32(tmp4, zero);
- stp2_14 = _mm_packs_epi32(tmp6, zero);
-
- stp2_10 = _mm_packs_epi32(tmp1, zero);
- stp2_13 = _mm_packs_epi32(tmp3, zero);
- stp2_11 = _mm_packs_epi32(tmp5, zero);
- stp2_12 = _mm_packs_epi32(tmp7, zero);
+ stp2_8 = _mm_packs_epi32(tmp0, tmp2);
+ stp2_11 = _mm_packs_epi32(tmp5, tmp7);
}
// Stage3
{
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
- tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
- tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
tmp0 = _mm_add_epi32(tmp0, rounding);
tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, zero);
- stp1_7 = _mm_packs_epi32(tmp2, zero);
- stp1_5 = _mm_packs_epi32(tmp4, zero);
- stp1_6 = _mm_packs_epi32(tmp6, zero);
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+ stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+ stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+ stp1_4 = _mm_packs_epi32(tmp0, tmp2);
}
// Stage4
{
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
- tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
@@ -2576,8 +2623,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp0 = _mm_add_epi32(tmp0, rounding);
tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
tmp1 = _mm_add_epi32(tmp1, rounding);
tmp3 = _mm_add_epi32(tmp3, rounding);
tmp5 = _mm_add_epi32(tmp5, rounding);
@@ -2585,49 +2630,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
- stp2_0 = _mm_packs_epi32(tmp0, zero);
- stp2_1 = _mm_packs_epi32(tmp2, zero);
- stp2_2 = _mm_packs_epi32(tmp4, zero);
- stp2_3 = _mm_packs_epi32(tmp6, zero);
- stp2_9 = _mm_packs_epi32(tmp1, zero);
- stp2_14 = _mm_packs_epi32(tmp3, zero);
- stp2_10 = _mm_packs_epi32(tmp5, zero);
- stp2_13 = _mm_packs_epi32(tmp7, zero);
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ stp1_0 = _mm_packs_epi32(tmp0, tmp0);
+ stp1_1 = _mm_packs_epi32(tmp2, tmp2);
+ stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+
+ stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
}
// Stage5 and Stage6
{
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
-
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+ tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+ tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+ tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+ tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+ stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+ stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+ stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+ stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
}
// Stage6
{
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
@@ -2652,21 +2688,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
- stp1_5 = _mm_packs_epi32(tmp1, zero);
- stp1_6 = _mm_packs_epi32(tmp3, zero);
+ stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+
stp2_10 = _mm_packs_epi32(tmp0, zero);
stp2_13 = _mm_packs_epi32(tmp2, zero);
stp2_11 = _mm_packs_epi32(tmp4, zero);
stp2_12 = _mm_packs_epi32(tmp6, zero);
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+ tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+ tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+ tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+ tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+ stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+ stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+ stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+ stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+ stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
}
// Stage7. Left 8x16 only.
@@ -2687,12 +2728,11 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
l[14] = _mm_sub_epi16(stp2_1, stp1_14);
l[15] = _mm_sub_epi16(stp2_0, stp1_15);
- // 2-D idct. We do 2 8x16 blocks.
+ // Second 1-D inverse transform, performed per 8x16 block
for (i = 0; i < 2; i++) {
array_transpose_4X8(l + 8*i, in);
- in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
- IDCT16_1D
+ IDCT16_10_1D
// Stage7
in[0] = _mm_add_epi16(stp2_0, stp1_15);
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 303fced3b..000000000
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
-
-
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
- // duplicate only the third 16 bit in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
- // duplicate only the seconds 16 bits in the filter into the second lane
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
- // duplicate only the forth 16 bits in the filter into the second lane
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- // loading the local filters
- thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
- forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pixels_per_line;
-
- // save only 4 bytes
- *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
- // filter the source buffer
- srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-
- srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pixels_per_line;
-
- // save only 8 bytes
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=output_pitch;
- }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits (third and forth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
- // filter the source buffer
- srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
- // reading the next 16 bytes.
- // (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- // filter the source buffer
- srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
- srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_min_epi16(srcRegFilt3, srcRegFilt2));
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
- _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
- src_ptr+=src_pixels_per_line;
-
- // save 16 bytes
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
- output_ptr+=output_pitch;
- }
-}
-
-
-
-void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
- __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
- // duplicate only the second 16 bits in the filter into the second lane
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
- // duplicate only the third 16 bits in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
- // duplicate only the forth 16 bits in the filter into the second lane
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- for (i = 0; i < output_height; i++) {
- // load the first 4 byte
- srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
- // load the next 4 bytes in stride of src_pitch
- srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-
-
- srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
- srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-
- srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
- srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
-
- // merge the result together
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
-
- srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
- srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
-
- // merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-
- // extract the second lane of the 128 bit register
- srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_srli_si128(srcRegFilt3, 8));
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // save only 4 bytes convolve result
- *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- for (i = 0; i < output_height; i++) {
- // load the first 8 bytes
- srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
- srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
- srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
- srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
-
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
- // converting the 16 bit (short) to 8 bit (byte) and have the same data
- // in both lanes of 128 bit register.
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
- // duplicate only the forth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-
- for (i = 0; i < output_height; i++) {
- // load the first 16 bytes
- srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
- // load the next 16 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
- srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
- // merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
- srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
- // load the next 16 bytes in stride of two/three src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
- // merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
- srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
- // load the next 16 bytes in stride of four/five src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
- // merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
- srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt4, srcRegFilt7));
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-
- // add and saturate the results together
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt4, srcRegFilt7));
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
- _mm_max_epi16(srcRegFilt6, srcRegFilt8));
- srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
- src_ptr+=src_pitch;
-
- // save 16 bytes convolve result
- _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
- output_ptr+=out_pitch;
- }
-}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index d17952487..6894f553f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1923,9 +1923,6 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
vp9_zero(cpi->mb.pred_mv);
- if (cpi->sf.reference_masking)
- rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
-
if (cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5317661c0..812ffa96d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -106,6 +106,7 @@ static int lookup_next_frame_stats(const struct twopass_rc *p,
return 1;
}
+
// Read frame stats at an offset from the current position
static int read_frame_stats(const struct twopass_rc *p,
FIRSTPASS_STATS *frame_stats, int offset) {
@@ -149,7 +150,7 @@ static void output_stats(const VP9_COMP *cpi,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(stdout, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
"%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
"%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
stats->frame,
@@ -349,17 +350,14 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) {
}
-// This function returns the current per frame maximum bitrate target.
+// This function returns the maximum target rate per frame.
static int frame_max_bits(VP9_COMP *cpi) {
- // Max allocation for a single frame based on the max section guidelines
- // passed in and how many bits are left.
- // For VBR base this on the bits and frames left plus the
- // two_pass_vbrmax_section rate passed in by the user.
- const double max_bits = (1.0 * cpi->twopass.bits_left /
- (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
- (cpi->oxcf.two_pass_vbrmax_section / 100.0);
+ int64_t max_bits =
+ ((int64_t)cpi->rc.av_per_frame_bandwidth *
+ (int64_t)cpi->oxcf.two_pass_vbrmax_section) / 100;
+
if (max_bits < 0)
- return 0;
+ return 0;
if (max_bits >= INT_MAX)
return INT_MAX;
return (int)max_bits;
@@ -1662,7 +1660,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Don't allow a gf too near the next kf
if ((cpi->rc.frames_to_key - i) < MIN_GF_INTERVAL) {
- while (i < cpi->rc.frames_to_key) {
+ while (i < (cpi->rc.frames_to_key + !cpi->rc.next_key_frame_forced)) {
i++;
if (EOF == input_stats(&cpi->twopass, this_frame))
@@ -1697,6 +1695,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
(i >= MIN_GF_INTERVAL) &&
+ // for real scene cuts (not forced kfs) dont allow arf very near kf.
+ (cpi->rc.next_key_frame_forced ||
+ (i <= (cpi->rc.frames_to_key - MIN_GF_INTERVAL))) &&
((next_frame.pcnt_inter > 0.75) ||
(next_frame.pcnt_second_ref > 0.5)) &&
((mv_in_out_accumulator / (double)i > -0.2) ||
@@ -1765,18 +1766,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif
#endif
- // Now decide how many bits should be allocated to the GF group as a
- // proportion of those remaining in the kf group.
- // The final key frame group in the clip is treated as a special case
- // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
- // This is also important for short clips where there may only be one
- // key frame.
- if (cpi->rc.frames_to_key >= (int)(cpi->twopass.total_stats.count -
- cpi->common.current_video_frame)) {
- cpi->twopass.kf_group_bits =
- (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
- }
-
// Calculate the bits to be allocated to the group as a whole
if ((cpi->twopass.kf_group_bits > 0) &&
(cpi->twopass.kf_group_error_left > 0)) {
@@ -1863,9 +1852,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (gf_bits < 0)
gf_bits = 0;
- // Add in minimum for a frame
- gf_bits += cpi->rc.min_frame_bandwidth;
-
if (i == 0) {
cpi->twopass.gf_bits = gf_bits;
}
@@ -1899,8 +1885,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
}
- cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
- - cpi->rc.min_frame_bandwidth;
+ cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits;
if (cpi->twopass.gf_group_bits < 0)
cpi->twopass.gf_group_bits = 0;
@@ -1985,9 +1970,6 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (cpi->twopass.gf_group_bits < 0)
cpi->twopass.gf_group_bits = 0;
- // Add in the minimum number of bits that is set aside for every frame.
- target_frame_size += cpi->rc.min_frame_bandwidth;
-
// Per frame bit target for this frame.
cpi->rc.per_frame_bandwidth = target_frame_size;
}
@@ -2029,6 +2011,22 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
}
}
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if ((cm->current_video_frame == 0 ||
+ cm->frame_flags & FRAMEFLAGS_KEY ||
+ cpi->rc.frames_to_key == 0 ||
+ (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
+ cm->frame_type = KEY_FRAME;
+ cpi->rc.frames_to_key = cpi->key_frame_frequency;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Don't use gf_update by default in CBR mode.
+ cpi->rc.frames_till_gf_update_due = INT_MAX;
+ cpi->rc.baseline_gf_interval = INT_MAX;
+}
+
void vp9_get_first_pass_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (!cpi->refresh_alt_ref_frame &&
@@ -2038,8 +2036,8 @@ void vp9_get_first_pass_params(VP9_COMP *cpi) {
} else {
cm->frame_type = INTER_FRAME;
}
- cpi->rc.frames_to_key = INT_MAX;
// Do not use periodic key frames
+ cpi->rc.frames_to_key = INT_MAX;
}
void vp9_get_second_pass_params(VP9_COMP *cpi) {
@@ -2265,8 +2263,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
vp9_zero(next_frame);
vp9_clear_system_state(); // __asm emms;
- start_position = cpi->twopass.stats_in;
+ start_position = cpi->twopass.stats_in;
cpi->common.frame_type = KEY_FRAME;
// is this a forced key frame by interval
@@ -2348,7 +2346,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// interval is between 1x and 2x
if (cpi->oxcf.auto_key
&& cpi->rc.frames_to_key > (int)cpi->key_frame_frequency) {
- FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
FIRSTPASS_STATS tmp_frame;
cpi->rc.frames_to_key /= 2;
@@ -2373,15 +2370,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Load a the next frame's stats
input_stats(&cpi->twopass, &tmp_frame);
}
-
- // Reset to the start of the group
- reset_fpf_position(&cpi->twopass, current_pos);
-
+ cpi->rc.next_key_frame_forced = 1;
+ } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
cpi->rc.next_key_frame_forced = 1;
} else {
cpi->rc.next_key_frame_forced = 0;
}
- // Special case for the last frame of the file
+
+ // Special case for the last key frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
@@ -2566,8 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
- // Add in the minimum frame allowance
- cpi->twopass.kf_bits += cpi->rc.min_frame_bandwidth;
// Peer frame bit target for this frame
cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 43703c2c5..f89e4cb1c 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -22,6 +22,7 @@ void vp9_end_second_pass(VP9_COMP *cpi);
void vp9_get_first_pass_params(VP9_COMP *cpi);
void vp9_get_one_pass_params(VP9_COMP *cpi);
+void vp9_get_one_pass_cbr_params(VP9_COMP *cpi);
void vp9_get_svc_params(VP9_COMP *cpi);
#endif // VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 946ed2818..8e60bc96d 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -581,22 +581,21 @@ static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) {
sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX;
}
-static void set_rt_speed_feature(VP9_COMMON *cm,
- SPEED_FEATURES *sf,
- int speed) {
- sf->static_segmentation = 0;
+static void set_good_speed_feature(VP9_COMMON *cm,
+ SPEED_FEATURES *sf,
+ int speed) {
+ int i;
sf->adaptive_rd_thresh = 1;
sf->recode_loop = (speed < 1);
- if (speed >= 1) {
+ if (speed == 1) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
- sf->less_rectangular_check = 1;
- sf->tx_size_search_method =
- frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = frame_is_intra_only(cm)
+ ? USE_FULL_RD : USE_LARGESTALL;
if (MIN(cm->width, cm->height) >= 720)
sf->disable_split_mask = cm->show_frame ?
- DISABLE_ALL_SPLIT :
- DISABLE_ALL_INTER_SPLIT;
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
else
sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
@@ -610,26 +609,26 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
}
- if (speed >= 2) {
+ if (speed == 2) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
- sf->less_rectangular_check = 1;
- sf->tx_size_search_method =
- frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = frame_is_intra_only(cm)
+ ? USE_FULL_RD : USE_LARGESTALL;
if (MIN(cm->width, cm->height) >= 720)
sf->disable_split_mask = cm->show_frame ?
- DISABLE_ALL_SPLIT :
- DISABLE_ALL_INTER_SPLIT;
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
else
sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
- | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
- | FLAG_SKIP_INTRA_LOWVAR;
-
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_INTRA_LOWVAR;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->adaptive_pred_filter_type = 2;
+ sf->reference_masking = 1;
sf->auto_mv_step_size = 1;
sf->disable_filter_search_var_thresh = 50;
@@ -649,7 +648,7 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
}
- if (speed >= 3) {
+ if (speed == 3) {
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
@@ -658,13 +657,15 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
else
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
- | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
- | FLAG_SKIP_INTRA_LOWVAR;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_INTRA_LOWVAR;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->adaptive_pred_filter_type = 2;
+ sf->reference_masking = 1;
sf->auto_mv_step_size = 1;
sf->disable_filter_search_var_thresh = 100;
@@ -684,19 +685,22 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
}
- if (speed >= 4) {
+ if (speed == 4) {
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
- | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
- | FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR
- | FLAG_EARLY_TERMINATE;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH |
+ FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->adaptive_pred_filter_type = 2;
+ sf->reference_masking = 1;
sf->auto_mv_step_size = 1;
sf->disable_filter_search_var_thresh = 200;
@@ -715,30 +719,24 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
-
- /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
- sf->search_method = BIGDIA;
- sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 64; */
}
- if (speed >= 5) {
- int i;
+ if (speed == 5) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_16X16;
- sf->tx_size_search_method =
- frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
- | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
- | FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR
- | FLAG_EARLY_TERMINATE;
+ sf->tx_size_search_method = frame_is_intra_only(cm) ?
+ USE_FULL_RD : USE_LARGESTALL;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA |
+ FLAG_SKIP_COMP_REFMISMATCH |
+ FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
sf->use_lp32x32fdct = 1;
sf->optimize_coefficients = 0;
sf->auto_mv_step_size = 1;
- // sf->reduce_first_step_size = 1;
- // sf->reference_masking = 1;
+ sf->reference_masking = 1;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->search_method = HEX;
@@ -754,6 +752,107 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
sf->mode_skip_start = 6;
}
}
+static void set_rt_speed_feature(VP9_COMMON *cm,
+ SPEED_FEATURES *sf,
+ int speed) {
+ sf->static_segmentation = 0;
+ sf->adaptive_rd_thresh = 1;
+ sf->recode_loop = (speed < 1);
+ if (speed == 1) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->disable_split_mask = cm->show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 1;
+ sf->auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed >= 2) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->disable_split_mask = cm->show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ else
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
+ | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
+ | FLAG_SKIP_INTRA_LOWVAR;
+
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 2;
+ sf->auto_mv_step_size = 1;
+ sf->reference_masking = 1;
+
+ sf->disable_filter_search_var_thresh = 50;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+
+ sf->auto_min_max_partition_size = 1;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+
+ sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->use_lp32x32fdct = 1;
+ sf->mode_skip_start = 11;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+ if (speed >= 3) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ else
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH
+ | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA
+ | FLAG_SKIP_INTRA_LOWVAR;
+
+ sf->disable_filter_search_var_thresh = 100;
+ sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
+ sf->use_uv_intra_rd_estimate = 1;
+ sf->skip_encode_sb = 1;
+ sf->subpel_iters_per_step = 1;
+ sf->use_fast_coef_updates = 2;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
+ }
+ if (speed >= 4) {
+ sf->optimize_coefficients = 0;
+ }
+ if (speed >= 5) {
+ int i;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+ }
+ }
+}
void vp9_set_speed_features(VP9_COMP *cpi) {
SPEED_FEATURES *sf = &cpi->sf;
@@ -772,7 +871,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
// best quality defaults
sf->RD = 1;
sf->search_method = NSTEP;
- sf->auto_filter = 1;
sf->recode_loop = 1;
sf->subpel_search_method = SUBPEL_TREE;
sf->subpel_iters_per_step = 2;
@@ -812,199 +910,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->using_small_partition_info = 0;
sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
-#if CONFIG_MULTIPLE_ARF
- // Switch segmentation off.
- sf->static_segmentation = 0;
-#else
- sf->static_segmentation = 0;
-#endif
-
switch (mode) {
case 0: // This is the best quality mode.
cpi->diamond_search_sad = vp9_full_range_search;
break;
-
case 1:
-
-#if CONFIG_MULTIPLE_ARF
- // Switch segmentation off.
- sf->static_segmentation = 0;
-#else
- sf->static_segmentation = 0;
-#endif
- sf->adaptive_rd_thresh = 1;
- sf->recode_loop = (speed < 1);
-
- if (speed == 1) {
- sf->use_square_partition_only = !frame_is_intra_only(cm);
- sf->less_rectangular_check = 1;
- sf->tx_size_search_method = frame_is_intra_only(cm)
- ? USE_FULL_RD : USE_LARGESTALL;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ?
- DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
- sf->use_rd_breakout = 1;
- sf->adaptive_motion_search = 1;
- sf->adaptive_pred_filter_type = 1;
- sf->auto_mv_step_size = 1;
- sf->adaptive_rd_thresh = 2;
- sf->recode_loop = 2;
- sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
- sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
- sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
- }
- if (speed == 2) {
- sf->use_square_partition_only = !frame_is_intra_only(cm);
- sf->less_rectangular_check = 1;
- sf->tx_size_search_method = frame_is_intra_only(cm)
- ? USE_FULL_RD : USE_LARGESTALL;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ?
- DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
-
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
- FLAG_SKIP_INTRA_BESTINTER |
- FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_INTRA_LOWVAR;
-
- sf->use_rd_breakout = 1;
- sf->adaptive_motion_search = 1;
- sf->adaptive_pred_filter_type = 2;
- sf->auto_mv_step_size = 1;
-
- sf->disable_filter_search_var_thresh = 50;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
- sf->auto_min_max_partition_size = 1;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
- sf->adjust_partitioning_from_last_frame = 1;
- sf->last_partitioning_redo_frequency = 3;
-
- sf->adaptive_rd_thresh = 2;
- sf->recode_loop = 2;
- sf->use_lp32x32fdct = 1;
- sf->mode_skip_start = 11;
- sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
- sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
- sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
- sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
- }
- if (speed == 3) {
- sf->use_square_partition_only = 1;
- sf->tx_size_search_method = USE_LARGESTALL;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
- else
- sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
-
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
- FLAG_SKIP_INTRA_BESTINTER |
- FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_INTRA_LOWVAR;
-
- sf->use_rd_breakout = 1;
- sf->adaptive_motion_search = 1;
- sf->adaptive_pred_filter_type = 2;
- sf->auto_mv_step_size = 1;
-
- sf->disable_filter_search_var_thresh = 100;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
- sf->auto_min_max_partition_size = 1;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
- sf->adjust_partitioning_from_last_frame = 1;
- sf->last_partitioning_redo_frequency = 3;
-
- sf->use_uv_intra_rd_estimate = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
- sf->subpel_iters_per_step = 1;
- sf->use_fast_coef_updates = 2;
-
- sf->adaptive_rd_thresh = 4;
- sf->mode_skip_start = 6;
- }
- if (speed == 4) {
- sf->use_square_partition_only = 1;
- sf->tx_size_search_method = USE_LARGESTALL;
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
-
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
- FLAG_SKIP_INTRA_BESTINTER |
- FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_COMP_REFMISMATCH |
- FLAG_SKIP_INTRA_LOWVAR |
- FLAG_EARLY_TERMINATE;
-
- sf->use_rd_breakout = 1;
- sf->adaptive_motion_search = 1;
- sf->adaptive_pred_filter_type = 2;
- sf->auto_mv_step_size = 1;
-
- sf->disable_filter_search_var_thresh = 200;
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-
- sf->auto_min_max_partition_size = 1;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
- sf->adjust_partitioning_from_last_frame = 1;
- sf->last_partitioning_redo_frequency = 3;
-
- sf->use_uv_intra_rd_estimate = 1;
- sf->skip_encode_sb = 1;
- sf->use_lp32x32fdct = 1;
- sf->subpel_iters_per_step = 1;
- sf->use_fast_coef_updates = 2;
-
- sf->adaptive_rd_thresh = 4;
- sf->mode_skip_start = 6;
-
- /* sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
- sf->search_method = BIGDIA;
- sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 64; */
- }
- if (speed == 5) {
- sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->use_one_partition_size_always = 1;
- sf->always_this_block_size = BLOCK_16X16;
- sf->tx_size_search_method = frame_is_intra_only(cm) ?
- USE_FULL_RD : USE_LARGESTALL;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
- FLAG_SKIP_INTRA_BESTINTER |
- FLAG_SKIP_COMP_BESTINTRA |
- FLAG_SKIP_COMP_REFMISMATCH |
- FLAG_SKIP_INTRA_LOWVAR |
- FLAG_EARLY_TERMINATE;
- sf->use_rd_breakout = 1;
- sf->use_lp32x32fdct = 1;
- sf->optimize_coefficients = 0;
- sf->auto_mv_step_size = 1;
- // sf->reduce_first_step_size = 1;
- // sf->reference_masking = 1;
-
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->search_method = HEX;
- sf->subpel_iters_per_step = 1;
- sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 500;
- for (i = 0; i < TX_SIZES; i++) {
- sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
- }
- sf->use_fast_coef_updates = 2;
- sf->adaptive_rd_thresh = 4;
- sf->mode_skip_start = 6;
- }
+ set_good_speed_feature(cm, sf, speed);
+ break;
break;
case 2:
set_rt_speed_feature(cm, sf, speed);
@@ -1350,8 +1262,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
- // cpi->use_golden_frame_only = 0;
- // cpi->use_last_frame_only = 0;
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 1;
cm->refresh_frame_context = 1;
@@ -1392,6 +1302,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
else
cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size) {
+ cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+ cpi->rc.buffer_level = cpi->rc.bits_off_target;
+ }
// Set up frame rate and related parameters rate control values.
vp9_new_framerate(cpi, cpi->oxcf.framerate);
@@ -1453,6 +1369,9 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
#endif
set_tile_limits(cpi);
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
}
#define M_LOG2_E 0.693147180559945309417
@@ -2257,19 +2176,20 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
if (ref_frame_flags > 7)
return -1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- cpi->refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 0;
+ cpi->ext_refresh_alt_ref_frame = 0;
+ cpi->ext_refresh_last_frame = 0;
if (ref_frame_flags & VP9_LAST_FLAG)
- cpi->refresh_last_frame = 1;
+ cpi->ext_refresh_last_frame = 1;
if (ref_frame_flags & VP9_GOLD_FLAG)
- cpi->refresh_golden_frame = 1;
+ cpi->ext_refresh_golden_frame = 1;
if (ref_frame_flags & VP9_ALT_FLAG)
- cpi->refresh_alt_ref_frame = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ cpi->ext_refresh_frame_flags_pending = 1;
return 0;
}
@@ -2325,7 +2245,8 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
}
int vp9_update_entropy(VP9_PTR comp, int update) {
- ((VP9_COMP *)comp)->common.refresh_frame_context = update;
+ ((VP9_COMP *)comp)->ext_refresh_frame_context = update;
+ ((VP9_COMP *)comp)->ext_refresh_frame_context_pending = 1;
return 0;
}
@@ -2975,6 +2896,23 @@ static void get_ref_frame_flags(VP9_COMP *cpi) {
cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
}
+static void set_ext_overrides(VP9_COMP *cpi) {
+ // Overrides the defaults with the externally supplied values with
+ // vp9_update_reference() and vp9_update_entropy() calls
+ // Note: The overrides are valid only for the next frame passed
+ // to encode_frame_to_data_rate() function
+ if (cpi->ext_refresh_frame_context_pending) {
+ cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+ cpi->ext_refresh_frame_context_pending = 0;
+ }
+ if (cpi->ext_refresh_frame_flags_pending) {
+ cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+ cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+ cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+ cpi->ext_refresh_frame_flags_pending = 0;
+ }
+}
+
static void encode_frame_to_data_rate(VP9_COMP *cpi,
size_t *size,
uint8_t *dest,
@@ -2991,6 +2929,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
struct segmentation *const seg = &cm->seg;
+ set_ext_overrides(cpi);
+
/* Scale the source buffer, if required. */
if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
@@ -3166,6 +3106,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
}
+ // If the encoder forced a KEY_FRAME decision
if (cm->frame_type == KEY_FRAME)
cpi->refresh_last_frame = 1;
@@ -3318,7 +3259,11 @@ static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
unsigned int *frame_flags) {
- vp9_get_one_pass_params(cpi);
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
+ vp9_get_one_pass_cbr_params(cpi);
+ } else {
+ vp9_get_one_pass_params(cpi);
+ }
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
}
@@ -3396,6 +3341,44 @@ int is_next_frame_arf(VP9_COMP *cpi) {
}
#endif
+void adjust_frame_rate(VP9_COMP *cpi) {
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = cpi->source->ts_end - cpi->source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration = cpi->last_end_time_stamp_seen
+ - cpi->last_time_stamp_seen;
+
+ this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ vp9_new_framerate(cpi, 10000000.0 / this_duration);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
+ const double interval = MIN((double)(cpi->source->ts_end
+ - cpi->first_time_stamp_ever), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ vp9_new_framerate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+ cpi->last_time_stamp_seen = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+}
+
int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -3415,6 +3398,13 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+ // Normal defaults
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 1;
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
// Should we code an alternate reference frame.
if (cpi->oxcf.play_alternate && cpi->rc.source_alt_ref_pending) {
int frames_to_arf;
@@ -3425,7 +3415,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
if (cpi->multi_arf_enabled && (cpi->pass == 2))
frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
- - cpi->next_frame_in_order;
+ - cpi->next_frame_in_order;
else
#endif
frames_to_arf = cpi->rc.frames_till_gf_update_due;
@@ -3509,18 +3499,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
*time_end = cpi->source->ts_end;
*frame_flags = cpi->source->flags;
- // fprintf(fp_out, " Frame:%d", cm->current_video_frame);
-#if CONFIG_MULTIPLE_ARF
- if (cpi->multi_arf_enabled) {
- // fprintf(fp_out, " seq_no:%d this_frame_weight:%d",
- // cpi->sequence_number, cpi->this_frame_weight);
- } else {
- // fprintf(fp_out, "\n");
- }
-#else
- // fprintf(fp_out, "\n");
-#endif
-
#if CONFIG_MULTIPLE_ARF
if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
cpi->rc.source_alt_ref_pending = is_next_frame_arf(cpi);
@@ -3542,43 +3520,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
}
// adjust frame rates based on timestamps given
- if (!cpi->refresh_alt_ref_frame) {
- int64_t this_duration;
- int step = 0;
-
- if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
- this_duration = cpi->source->ts_end - cpi->source->ts_start;
- step = 1;
- } else {
- int64_t last_duration = cpi->last_end_time_stamp_seen
- - cpi->last_time_stamp_seen;
-
- this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-
- // do a step update if the duration changes by 10%
- if (last_duration)
- step = (int)((this_duration - last_duration) * 10 / last_duration);
- }
-
- if (this_duration) {
- if (step) {
- vp9_new_framerate(cpi, 10000000.0 / this_duration);
- } else {
- // Average this frame's rate into the last second's average
- // frame rate. If we haven't seen 1 second yet, then average
- // over the whole interval seen.
- const double interval = MIN((double)(cpi->source->ts_end
- - cpi->first_time_stamp_ever), 10000000.0);
- double avg_duration = 10000000.0 / cpi->oxcf.framerate;
- avg_duration *= (interval - avg_duration + this_duration);
- avg_duration /= interval;
-
- vp9_new_framerate(cpi, 10000000.0 / avg_duration);
- }
- }
-
- cpi->last_time_stamp_seen = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+ if (cm->show_frame) {
+ adjust_frame_rate(cpi);
}
// start with a 0 size frame
@@ -3604,21 +3547,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
}
#endif
-#if 0 // CONFIG_MULTIPLE_ARF
- if (cpi->multi_arf_enabled) {
- fprintf(fp_out, " idx(%d, %d, %d, %d) active(%d, %d, %d)",
- cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx,
- cm->ref_frame_map[cpi->lst_fb_idx],
- cm->ref_frame_map[cpi->gld_fb_idx],
- cm->ref_frame_map[cpi->alt_fb_idx]);
- if (cpi->refresh_alt_ref_frame)
- fprintf(fp_out, " type:ARF");
- if (cpi->rc.is_src_frame_alt_ref)
- fprintf(fp_out, " type:OVERLAY[%d]", cpi->alt_fb_idx);
- fprintf(fp_out, "\n");
- }
-#endif
-
cm->frame_flags = *frame_flags;
// Reset the frame pointers to the current frame size
@@ -3669,15 +3597,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
}
if (*size > 0) {
- // if its a dropped frame honor the requests on subsequent frames
cpi->droppable = !frame_is_reference(cpi);
-
- // return to normal state
- cm->reset_frame_context = 0;
- cm->refresh_frame_context = 1;
- cpi->refresh_alt_ref_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_last_frame = 1;
}
vpx_usec_timer_mark(&cmptimer);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 946424534..a5be0f424 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -232,57 +232,185 @@ typedef enum {
} LAST_FRAME_PARTITION_METHOD;
typedef struct {
+ // This flag refers to whether or not to perform rd optimization.
int RD;
+
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
SEARCH_METHODS search_method;
- int auto_filter;
+
+ // Recode_loop can be:
+ // 0 means we only encode a frame once
+ // 1 means we can re-encode based on bitrate constraints on any frame
+ // 2 means we can only recode gold, alt, and key frames.
int recode_loop;
+
+ // Subpel_search_method can only be subpel_tree which does a subpixel
+ // logarithmic search that keeps stepping at 1/2 pixel units until
+ // you stop getting a gain, and then goes on to 1/4 and repeats
+ // the same process. Along the way it skips many diagonals.
SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
int subpel_iters_per_step;
+
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
int thresh_mult[MAX_MODES];
int thresh_mult_sub8x8[MAX_REFS];
+
+ // This parameter controls the number of steps we'll do in a diamond
+ // search.
int max_step_search_steps;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
int auto_mv_step_size;
+
+ // Trellis (dynamic programming) optimization of quantized values (+1, 0).
int optimize_coefficients;
+
+ // Always set to 0. If on it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+ // disabled because the addition of very large block sizes make the
+ // backgrounds very to cheap to encode, and the segmentation we have
+ // adds overhead.
int static_segmentation;
+
+ // If 1 we iterate finding a best reference for 2 ref frames together - via
+ // a log search that iterates 4 times (check around mv for last for best
+ // error of combined predictor then check around mv for alt). If 0 we
+ // we just use the best motion vector found for each frame by itself.
int comp_inter_joint_search_thresh;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
int adaptive_rd_thresh;
+
+ // Enables skipping the reconstruction step (idct, recon) in the
+ // intermediate steps assuming the last frame didn't have too many intra
+ // blocks and the q is less than a threshold.
int skip_encode_sb;
int skip_encode_frame;
+
+ // This variable allows us to reuse the last frames partition choices
+ // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
+ // frame as a starting point in low motion scenes or always use it. If set
+ // we use last partitioning_redo frequency to determine how often to redo
+ // the partitioning from scratch. Adjust_partitioning_from_last_frame
+ // enables us to adjust up or down one partitioning from the last frames
+ // partitioning.
LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+ // precise but significantly faster than the non lp version.
int use_lp32x32fdct;
+
+ // TODO(JBB): remove this as its no longer used.
+
+ // If set partition size will always be always_this_block_size.
int use_one_partition_size_always;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
int less_rectangular_check;
+
+ // Disable testing non square partitions. (eg 16x32)
int use_square_partition_only;
+
+ // After looking at the first set of modes (set by index here), skip
+ // checking modes for reference frames that don't match the reference frame
+ // of the best so far.
int mode_skip_start;
+
+ // TODO(JBB): Remove this.
int reference_masking;
+
+ // Used in conjunction with use_one_partition_size_always.
BLOCK_SIZE always_this_block_size;
+
+ // Sets min and max partition sizes for this 64x64 region based on the
+ // same superblock in last encoded frame, and the left and above neighbor
+ // in this block.
int auto_min_max_partition_size;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
BLOCK_SIZE min_partition_size;
BLOCK_SIZE max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
int adjust_partitioning_from_last_frame;
+
+ // How frequently we re do the partitioning from scratch. Only used if
+ // use_lastframe_partitioning is set.
int last_partitioning_redo_frequency;
+
+ // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+ // it always, to allow it for only Last frame and Intra, disable it for all
+ // inter modes or to enable it always.
int disable_split_mask;
+
+ // TODO(jbb): Remove this and everything that uses it. It's only valid if
+ // we were doing small to large partition checks. We currently do the
+ // reverse.
int using_small_partition_info;
+
// TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
int adaptive_motion_search;
+
+ // Allows sub 8x8 modes to use the prediction filter that was determined
+ // best for 8x8 mode. If set to 0 we always re check all the filters for
+ // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_filter_type;
// Implements various heuristics to skip searching modes
// The heuristics selected are based on flags
// defined in the MODE_SEARCH_SKIP_HEURISTICS enum
unsigned int mode_search_skip_flags;
+
// A source variance threshold below which the split mode is disabled
unsigned int disable_split_var_thresh;
+
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
+
+ // This variable enables an early break out of mode testing if the model for
+ // rd built from the prediction signal indicates a value that's much
+ // higher than the best rd we've seen so far.
int use_rd_breakout;
+
+ // This enables us to use an estimate for intra rd based on dc mode rather
+ // than choosing an actual uv mode in the stage of encoding before the actual
+ // final encode.
int use_uv_intra_rd_estimate;
+
+ // This picks a loop filter strength by trying a small portion of the image
+ // with different values.
int use_fast_lpf_pick;
+
+ // This feature limits the number of coefficients updates we actually do
+ // by only looking at counts from 1/2 the bands.
int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
} SPEED_FEATURES;
@@ -400,6 +528,15 @@ typedef struct VP9_COMP {
int refresh_last_frame;
int refresh_golden_frame;
int refresh_alt_ref_frame;
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+
YV12_BUFFER_CONFIG last_frame_uf;
TOKENEXTRA *tok;
@@ -596,7 +733,7 @@ typedef struct VP9_COMP {
int *mb_norm_activity_map;
int output_partition;
- /* force next frame to intra when kf_auto says so */
+ // Force next frame to intra when kf_auto says so.
int force_next_frame_intra;
int droppable;
@@ -638,7 +775,7 @@ typedef struct VP9_COMP {
int64_t mode_test_hits[BLOCK_SIZES];
#endif
- /* Y,U,V,(A) */
+ // Y,U,V,(A)
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 31b0116a4..939a7f998 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -184,8 +184,6 @@ void vp9_setup_key_frame(VP9_COMP *cpi) {
vp9_setup_past_independence(cm);
- // interval before next GF
- cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
/* All buffers are implicitly updated on key frames. */
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
@@ -432,10 +430,32 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) {
}
}
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+ if (cpi->common.frame_type == KEY_FRAME) {
+ return cpi->rc.key_frame_rate_correction_factor;
+ } else {
+ if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+ return cpi->rc.gf_rate_correction_factor;
+ else
+ return cpi->rc.rate_correction_factor;
+ }
+}
+
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+ if (cpi->common.frame_type == KEY_FRAME) {
+ cpi->rc.key_frame_rate_correction_factor = factor;
+ } else {
+ if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
+ cpi->rc.gf_rate_correction_factor = factor;
+ else
+ cpi->rc.rate_correction_factor = factor;
+ }
+}
+
void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
const int q = cpi->common.base_qindex;
int correction_factor = 100;
- double rate_correction_factor;
+ double rate_correction_factor = get_rate_correction_factor(cpi);
double adjustment_limit;
int projected_size_based_on_q = 0;
@@ -443,15 +463,6 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// Clear down mmx registers to allow floating point in what follows
vp9_clear_system_state(); // __asm emms;
- if (cpi->common.frame_type == KEY_FRAME) {
- rate_correction_factor = cpi->rc.key_frame_rate_correction_factor;
- } else {
- if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- rate_correction_factor = cpi->rc.gf_rate_correction_factor;
- else
- rate_correction_factor = cpi->rc.rate_correction_factor;
- }
-
// Work out how big we would have expected the frame to be at this Q given
// the current correction factor.
// Stay in double to avoid int overflow when values are large
@@ -501,36 +512,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
rate_correction_factor = MIN_BPB_FACTOR;
}
- if (cpi->common.frame_type == KEY_FRAME) {
- cpi->rc.key_frame_rate_correction_factor = rate_correction_factor;
- } else {
- if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- cpi->rc.gf_rate_correction_factor = rate_correction_factor;
- else
- cpi->rc.rate_correction_factor = rate_correction_factor;
- }
+ set_rate_correction_factor(cpi, rate_correction_factor);
}
int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
int active_best_quality, int active_worst_quality) {
int q = active_worst_quality;
-
- int i;
int last_error = INT_MAX;
- int target_bits_per_mb;
- int bits_per_mb_at_this_q;
- double correction_factor;
-
- // Select the appropriate correction factor based upon type of frame.
- if (cpi->common.frame_type == KEY_FRAME) {
- correction_factor = cpi->rc.key_frame_rate_correction_factor;
- } else {
- if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- correction_factor = cpi->rc.gf_rate_correction_factor;
- else
- correction_factor = cpi->rc.rate_correction_factor;
- }
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
+ const double correction_factor = get_rate_correction_factor(cpi);
// Calculate required scaling factor based on target frame size and size of
// frame produced using previous Q.
@@ -855,7 +846,6 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
// Update the Golden frame usage counts.
if (cpi->refresh_golden_frame) {
// this frame refreshes means next frames don't unless specified by user
- cpi->refresh_golden_frame = 0;
cpi->rc.frames_since_golden = 0;
if (!cpi->rc.source_alt_ref_pending)
@@ -876,36 +866,35 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
// Update rate control heuristics
- cpi->rc.projected_frame_size = (bytes_used << 3);
+ rc->projected_frame_size = (bytes_used << 3);
// Post encode loop adjustment of Q prediction.
- vp9_rc_update_rate_correction_factors(
- cpi, (cpi->sf.recode_loop ||
+ vp9_rc_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
// Keep a record of last Q and ambient average Q.
if (cm->frame_type == KEY_FRAME) {
- cpi->rc.last_q[KEY_FRAME] = cm->base_qindex;
- cpi->rc.avg_frame_qindex[KEY_FRAME] =
- (2 + 3 * cpi->rc.avg_frame_qindex[KEY_FRAME] + cm->base_qindex) >> 2;
- } else if (!cpi->rc.is_src_frame_alt_ref &&
+ rc->last_q[KEY_FRAME] = cm->base_qindex;
+ rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(
+ 3 * rc->avg_frame_qindex[KEY_FRAME] + cm->base_qindex, 2);
+ } else if (!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
- cpi->rc.last_q[2] = cm->base_qindex;
- cpi->rc.avg_frame_qindex[2] =
- (2 + 3 * cpi->rc.avg_frame_qindex[2] + cm->base_qindex) >> 2;
+ rc->last_q[2] = cm->base_qindex;
+ rc->avg_frame_qindex[2] = ROUND_POWER_OF_TWO(
+ 3 * rc->avg_frame_qindex[2] + cm->base_qindex, 2);
} else {
- cpi->rc.last_q[INTER_FRAME] = cm->base_qindex;
- cpi->rc.avg_frame_qindex[INTER_FRAME] =
- (2 + 3 * cpi->rc.avg_frame_qindex[INTER_FRAME] +
- cm->base_qindex) >> 2;
- cpi->rc.ni_frames++;
- cpi->rc.tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
- cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+ rc->last_q[INTER_FRAME] = cm->base_qindex;
+ rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+ 3 * rc->avg_frame_qindex[INTER_FRAME] + cm->base_qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
+ rc->avg_q = rc->tot_q / (double)rc->ni_frames;
// Calculate the average Q for normal inter frames (not key or GFU frames).
- cpi->rc.ni_tot_qi += cm->base_qindex;
- cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+ rc->ni_tot_qi += cm->base_qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
}
// Keep record of last boosted (KF/KF/ARF) Q value.
@@ -913,38 +902,34 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
// If all mbs in this group are skipped only update if the Q value is
// better than that already stored.
// This is used to help set quality in forced key frames to reduce popping
- if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+ if ((cm->base_qindex < rc->last_boosted_qindex) ||
((cpi->static_mb_pct < 100) &&
((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)))) {
- cpi->rc.last_boosted_qindex = cm->base_qindex;
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = cm->base_qindex;
}
- vp9_update_buffer_level(cpi, cpi->rc.projected_frame_size);
+ vp9_update_buffer_level(cpi, rc->projected_frame_size);
// Rolling monitors of whether we are over or underspending used to help
// regulate min and Max Q in two pass.
if (cm->frame_type != KEY_FRAME) {
- cpi->rc.rolling_target_bits =
- ((cpi->rc.rolling_target_bits * 3) +
- cpi->rc.this_frame_target + 2) / 4;
- cpi->rc.rolling_actual_bits =
- ((cpi->rc.rolling_actual_bits * 3) +
- cpi->rc.projected_frame_size + 2) / 4;
- cpi->rc.long_rolling_target_bits =
- ((cpi->rc.long_rolling_target_bits * 31) +
- cpi->rc.this_frame_target + 16) / 32;
- cpi->rc.long_rolling_actual_bits =
- ((cpi->rc.long_rolling_actual_bits * 31) +
- cpi->rc.projected_frame_size + 16) / 32;
+ rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
}
// Actual bits spent
- cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+ rc->total_actual_bits += rc->projected_frame_size;
// Debug stats
- cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
- cpi->rc.projected_frame_size);
+ rc->total_target_vs_actual += (rc->this_frame_target -
+ rc->projected_frame_size);
#ifndef DISABLE_RC_LONG_TERM_MEM
// Update bits left to the kf and gf groups to account for overshoot or
@@ -962,8 +947,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
}
#endif
- if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame
- && (cm->frame_type != KEY_FRAME))
+ if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame &&
+ (cm->frame_type != KEY_FRAME))
// Update the alternate reference frame stats as appropriate.
update_alt_ref_frame_stats(cpi);
else
@@ -971,10 +956,10 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
update_golden_frame_stats(cpi);
if (cm->frame_type == KEY_FRAME)
- cpi->rc.frames_since_key = 0;
+ rc->frames_since_key = 0;
if (cm->show_frame) {
- cpi->rc.frames_since_key++;
- cpi->rc.frames_to_key--;
+ rc->frames_since_key++;
+ rc->frames_to_key--;
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 97dc1e0ff..81d47de92 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2124,10 +2124,10 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
cpi->common.show_frame &&
block_size < cpi->sf.max_partition_size);
- int_mv pred_mv[3] = {
- mbmi->ref_mvs[ref_frame][0], mbmi->ref_mvs[ref_frame][1],
- x->pred_mv[ref_frame]
- };
+ int_mv pred_mv[3];
+ pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
+ pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
+ pred_mv[2] = x->pred_mv[ref_frame];
// Get the sad for each candidate reference mv
for (i = 0; i < num_mv_refs; i++) {
@@ -2276,14 +2276,14 @@ static void setup_pred_block(const MACROBLOCKD *xd,
}
}
-static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
- int idx, MV_REFERENCE_FRAME frame_type,
- BLOCK_SIZE block_size,
- int mi_row, int mi_col,
- int_mv frame_nearest_mv[MAX_REF_FRAMES],
- int_mv frame_near_mv[MAX_REF_FRAMES],
- struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
+ int idx, MV_REFERENCE_FRAME frame_type,
+ BLOCK_SIZE block_size,
+ int mi_row, int mi_col,
+ int_mv frame_nearest_mv[MAX_REF_FRAMES],
+ int_mv frame_near_mv[MAX_REF_FRAMES],
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
VP9_COMMON *cm = &cpi->common;
YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2355,9 +2355,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
- int_mv pred_mv[3] = {
- mbmi->ref_mvs[ref][0], mbmi->ref_mvs[ref][1], x->pred_mv[ref]
- };
+ int_mv pred_mv[3];
+ pred_mv[0] = mbmi->ref_mvs[ref][0];
+ pred_mv[1] = mbmi->ref_mvs[ref][1];
+ pred_mv[2] = x->pred_mv[ref];
if (scaled_ref_frame) {
int i;
@@ -3174,17 +3175,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
- ref_frame, block_size, mi_row, mi_col,
- frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+ ref_frame, block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
+ cpi->ref_frame_mask = 0;
+ for (ref_frame = LAST_FRAME;
+ ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+ cpi->ref_frame_mask |= (1 << ref_frame);
+ break;
+ }
+ }
+ }
+
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
int mode_excluded = 0;
int64_t this_rd = INT64_MAX;
@@ -3234,8 +3247,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
// Skip if the current reference frame has been masked off
- if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
- (cpi->ref_frame_mask & (1 << ref_frame)))
+ if (cpi->ref_frame_mask & (1 << ref_frame) && this_mode != NEWMV)
continue;
// Test best rd so far against threshold for trying this mode.
@@ -3640,11 +3652,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- // If we are using reference masking and the set mask flag is set then
- // create the reference frame mask.
- if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
- cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame[0]);
-
// Flag all modes that have a distortion thats > 2x the best we found at
// this level.
for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
@@ -3796,15 +3803,27 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
- ref_frame, block_size, mi_row, mi_col,
- frame_mv[NEARESTMV], frame_mv[NEARMV],
- yv12_mb);
+ vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+ ref_frame, block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
+ yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
+ cpi->ref_frame_mask = 0;
+ for (ref_frame = LAST_FRAME;
+ ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
+ cpi->ref_frame_mask |= (1 << ref_frame);
+ break;
+ }
+ }
+ }
+
for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
int mode_excluded = 0;
int64_t this_rd = INT64_MAX;
@@ -3852,11 +3871,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
continue;
}
- // Skip if the current reference frame has been masked off
- if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
- (cpi->ref_frame_mask & (1 << ref_frame)))
- continue;
-
// Test best rd so far against threshold for trying this mode.
if ((best_rd <
((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
@@ -4366,11 +4380,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- // If we are using reference masking and the set mask flag is set then
- // create the reference frame mask.
- if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
- cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame[0]);
-
if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
*returnrate = INT_MAX;
*returndistortion = INT_MAX;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index f0e8849c1..5732c2b2d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -27,6 +27,15 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
+void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
+ int idx, MV_REFERENCE_FRAME frame_type,
+ BLOCK_SIZE block_size,
+ int mi_row, int mi_col,
+ int_mv frame_nearest_mv[MAX_REF_FRAMES],
+ int_mv frame_near_mv[MAX_REF_FRAMES],
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE]);
+
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *r, int64_t *d, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index bd1150b98..b1c029cba 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,7 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index a7d2e1d6a..a03f7befc 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -25,7 +25,7 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t;
/* Structures for handling memory allocations */
typedef enum {
- VP9_SEG_ALG_PRIV = 256,
+ VP9_SEG_ALG_PRIV = 256,
VP9_SEG_MAX
} mem_seg_id_t;
#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
@@ -107,12 +107,11 @@ static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) {
static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data) {
- vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_err_t res = VPX_CODEC_OK;
- /* This function only allocates space for the vpx_codec_alg_priv_t
- * structure. More memory may be required at the time the stream
- * information becomes known.
- */
+ // This function only allocates space for the vpx_codec_alg_priv_t
+ // structure. More memory may be required at the time the stream
+ // information becomes known.
if (!ctx->priv) {
vpx_codec_mmap_t mmap;
@@ -122,12 +121,10 @@ static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx,
mmap.flags = vp9_mem_req_segs[0].flags;
res = vpx_mmap_alloc(&mmap);
-
if (!res) {
vp9_init_ctx(ctx, &mmap);
ctx->priv->alg_priv->defer_alloc = 1;
- /*post processing level initialized to do nothing */
}
}
@@ -147,8 +144,7 @@ static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) {
return VPX_CODEC_OK;
}
-static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
- unsigned int data_sz,
+static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz,
vpx_codec_stream_info_t *si) {
if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM;
if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM;
@@ -213,13 +209,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
vpx_codec_stream_info_t *si) {
- unsigned int sz;
-
- if (si->sz >= sizeof(vp9_stream_info_t))
- sz = sizeof(vp9_stream_info_t);
- else
- sz = sizeof(vpx_codec_stream_info_t);
-
+ const size_t sz = (si->sz >= sizeof(vp9_stream_info_t))
+ ? sizeof(vp9_stream_info_t)
+ : sizeof(vpx_codec_stream_info_t);
memcpy(si, &ctx->si, sz);
si->sz = sz;
@@ -227,24 +219,17 @@ static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
}
-static vpx_codec_err_t
-update_error_state(vpx_codec_alg_priv_t *ctx,
- const struct vpx_internal_error_info *error) {
- vpx_codec_err_t res;
-
- if ((res = error->error_code))
- ctx->base.err_detail = error->has_detail
- ? error->detail
- : NULL;
+static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error) {
+ if (error->error_code)
+ ctx->base.err_detail = error->has_detail ? error->detail : NULL;
- return res;
+ return error->error_code;
}
-static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
- const uint8_t **data,
- unsigned int data_sz,
- void *user_priv,
- long deadline) {
+static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
+ const uint8_t **data, unsigned int data_sz,
+ void *user_priv, int64_t deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
ctx->img_avail = 0;
@@ -304,13 +289,11 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
oxcf.inv_tile_order = ctx->invert_tile_order;
optr = vp9_create_decompressor(&oxcf);
- /* If postprocessing was enabled by the application and a
- * configuration has not been provided, default it.
- */
- if (!ctx->postproc_cfg_set
- && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
- ctx->postproc_cfg.post_proc_flag =
- VP8_DEBLOCK | VP8_DEMACROBLOCK;
+ // If postprocessing was enabled by the application and a
+ // configuration has not been provided, default it.
+ if (!ctx->postproc_cfg_set &&
+ (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) {
+ ctx->postproc_cfg.post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;
ctx->postproc_cfg.deblocking_level = 4;
ctx->postproc_cfg.noise_level = 0;
}
@@ -354,25 +337,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
flags.post_proc_flag =
#if CONFIG_POSTPROC_VISUALIZER
- ((ctx->dbg_color_ref_frame_flag != 0) ?
- VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
- | ((ctx->dbg_color_mb_modes_flag != 0) ?
- VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_color_b_modes_flag != 0) ?
- VP9D_DEBUG_CLR_BLK_MODES : 0)
- | ((ctx->dbg_display_mv_flag != 0) ?
- VP9D_DEBUG_DRAW_MV : 0)
- |
+ (ctx->dbg_color_ref_frame_flag ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0) |
+ (ctx->dbg_color_mb_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
+ (ctx->dbg_color_b_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
+ (ctx->dbg_display_mv_flag ? VP9D_DEBUG_DRAW_MV : 0) |
#endif
ctx->postproc_cfg.post_proc_flag;
- flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
- flags.noise_level = ctx->postproc_cfg.noise_level;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
#if CONFIG_POSTPROC_VISUALIZER
flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
- flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
- flags.display_mv_flag = ctx->dbg_display_mv_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
#endif
}
@@ -391,10 +369,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
return res;
}
-static void parse_superframe_index(const uint8_t *data,
- size_t data_sz,
- uint32_t sizes[8],
- int *count) {
+static void parse_superframe_index(const uint8_t *data, size_t data_sz,
+ uint32_t sizes[8], int *count) {
uint8_t marker;
assert(data_sz);
@@ -527,11 +503,11 @@ static vpx_codec_err_t vp9_set_frame_buffers(
return VPX_CODEC_ERROR;
}
-static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
- vpx_codec_mmap_t *mmap,
- vpx_codec_iter_t *iter) {
- vpx_codec_err_t res;
- const mem_req_t *seg_iter = *iter;
+static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+ vpx_codec_mmap_t *mmap,
+ vpx_codec_iter_t *iter) {
+ vpx_codec_err_t res;
+ const mem_req_t *seg_iter = *iter;
/* Get address of next segment request */
do {
@@ -560,7 +536,7 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
return res;
}
-static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
const vpx_codec_mmap_t *mmap) {
vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
int i, done;
@@ -596,8 +572,7 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
return res;
}
-static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
+static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -606,7 +581,6 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
YV12_BUFFER_CONFIG sd;
image2yuvconfig(&frame->img, &sd);
-
return vp9_set_reference_dec(ctx->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
@@ -614,8 +588,7 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
}
}
-static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
+static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -632,8 +605,7 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
}
}
-static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, int ctr_id,
va_list args) {
vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
@@ -648,8 +620,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
}
}
-static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
+static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id,
va_list args) {
#if CONFIG_VP9_POSTPROC
vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
@@ -666,8 +637,7 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
#endif
}
-static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
+static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx, int ctrl_id,
va_list args) {
#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
int data = va_arg(args, int);
@@ -688,8 +658,7 @@ static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx,
}
static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+ int ctrl_id, va_list args) {
int *update_info = va_arg(args, int *);
VP9D_COMP *pbi = (VP9D_COMP*)ctx->pbi;
@@ -704,8 +673,7 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+ int ctrl_id, va_list args) {
int *corrupted = va_arg(args, int *);
if (corrupted) {
@@ -721,8 +689,7 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
}
static vpx_codec_err_t get_display_size(vpx_codec_alg_priv_t *ctx,
- int ctrl_id,
- va_list args) {
+ int ctrl_id, va_list args) {
int *const display_size = va_arg(args, int *);
if (display_size) {