diff options
45 files changed, 987 insertions, 456 deletions
@@ -35,6 +35,9 @@ Advanced options: ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_static_msvcrt} use static MSVCRT (VS builds only) ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles + ${toggle_better_hw_compatibility} + enable encoder to produce streams with better + hardware decoder compatibility ${toggle_vp8} VP8 codec support ${toggle_vp9} VP9 codec support ${toggle_vp10} VP10 codec support @@ -320,6 +323,7 @@ CONFIG_LIST=" vp9_temporal_denoising coefficient_range_checking vp9_highbitdepth + better_hw_compatibility experimental size_limit ${EXPERIMENT_LIST} @@ -378,6 +382,7 @@ CMDLINE_SELECT=" temporal_denoising vp9_temporal_denoising coefficient_range_checking + better_hw_compatibility vp9_highbitdepth experimental " diff --git a/test/sad_test.cc b/test/sad_test.cc index 9dec3cb49..a144cfce7 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -702,18 +702,6 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #if HAVE_SSE #if CONFIG_USE_X86INC -const SadMxNParam sse_tests[] = { - make_tuple(4, 8, &vpx_sad4x8_sse, -1), - make_tuple(4, 4, &vpx_sad4x4_sse, -1), -}; -INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests)); - -const SadMxNAvgParam avg_sse_tests[] = { - make_tuple(4, 8, &vpx_sad4x8_avg_sse, -1), - make_tuple(4, 4, &vpx_sad4x4_avg_sse, -1), -}; -INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests)); - const SadMxNx4Param x4d_sse_tests[] = { make_tuple(4, 8, &vpx_sad4x8x4d_sse, -1), make_tuple(4, 4, &vpx_sad4x4x4d_sse, -1), @@ -736,6 +724,8 @@ const SadMxNParam sse2_tests[] = { make_tuple(8, 16, &vpx_sad8x16_sse2, -1), make_tuple(8, 8, &vpx_sad8x8_sse2, -1), make_tuple(8, 4, &vpx_sad8x4_sse2, -1), + make_tuple(4, 8, &vpx_sad4x8_sse2, -1), + make_tuple(4, 4, &vpx_sad4x4_sse2, -1), #if CONFIG_VP9_HIGHBITDEPTH make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 8), make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 8), @@ -786,6 +776,8 @@ const SadMxNAvgParam avg_sse2_tests[] = { make_tuple(8, 16, &vpx_sad8x16_avg_sse2, -1), make_tuple(8, 8, &vpx_sad8x8_avg_sse2, -1), make_tuple(8, 4, &vpx_sad8x4_avg_sse2, -1), + make_tuple(4, 8, &vpx_sad4x8_avg_sse2, -1), + make_tuple(4, 4, &vpx_sad4x4_avg_sse2, -1), #if CONFIG_VP9_HIGHBITDEPTH make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8), make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8), diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index ad3327e2d..e6198afbd 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -132,7 +132,6 @@ using std::tr1::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_USE_X86INC -#if ARCH_X86_64 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, ::testing::Values( make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, @@ -141,13 +140,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, &vpx_highbd_tm_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 8), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 8), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 8), @@ -155,34 +154,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, &vpx_highbd_v_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 8))); -#else -INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, - ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, - &vpx_highbd_dc_predictor_4x4_c, 4, 8), - make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 8), - make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 8), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, - &vpx_highbd_v_predictor_4x4_c, 4, 8), - make_tuple(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 8), - make_tuple(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 8), - make_tuple(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, - &vpx_highbd_tm_predictor_4x4_c, 4, 8), - make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 8))); -#endif // !ARCH_X86_64 -#if ARCH_X86_64 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, ::testing::Values( make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, @@ -194,14 +170,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 10), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 10), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 10), @@ -211,35 +187,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, - &vpx_highbd_tm_predictor_4x4_c, 4, 10), - make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 10))); -#else -INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, - ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, - &vpx_highbd_dc_predictor_4x4_c, 4, 10), - make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 10), - make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, - 10), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, - &vpx_highbd_v_predictor_4x4_c, 4, 10), - make_tuple(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 10), - make_tuple(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 10), - make_tuple(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 10))); -#endif // !ARCH_X86_64 -#if ARCH_X86_64 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, ::testing::Values( make_tuple(&vpx_highbd_dc_predictor_32x32_sse2, @@ -251,14 +203,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, make_tuple(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, + make_tuple(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, &vpx_highbd_dc_predictor_8x8_c, 8, 12), make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, &vpx_highbd_dc_predictor_16x16_c, 16, 12), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, + make_tuple(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_v_predictor_8x8_sse2, &vpx_highbd_v_predictor_8x8_c, 8, 12), @@ -268,33 +220,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 12), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 12))); -#else -INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, - ::testing::Values( - make_tuple(&vpx_highbd_dc_predictor_4x4_sse, - &vpx_highbd_dc_predictor_4x4_c, 4, 12), - make_tuple(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 12), - make_tuple(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, - 12), - make_tuple(&vpx_highbd_v_predictor_4x4_sse, - &vpx_highbd_v_predictor_4x4_c, 4, 12), - make_tuple(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 12), - make_tuple(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 12), - make_tuple(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 12), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, - &vpx_highbd_tm_predictor_4x4_c, 4, 12), - make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 12))); -#endif // !ARCH_X86_64 + #endif // CONFIG_USE_X86INC #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h index 2f10378a6..fce176796 100644 --- a/vp10/common/blockd.h +++ b/vp10/common/blockd.h @@ -82,6 +82,7 @@ typedef struct { // Only for INTER blocks INTERP_FILTER interp_filter; MV_REFERENCE_FRAME ref_frame[2]; + TX_TYPE tx_type; // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead. int_mv mv[2]; @@ -207,7 +208,7 @@ static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, return subsize_lookup[partition][bsize]; } -static const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { +static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = { DCT_DCT, // DC ADST_DCT, // V DCT_ADST, // H @@ -225,11 +226,12 @@ static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, const MODE_INFO *const mi = xd->mi[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; + (void) block_idx; if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] || - is_inter_block(mbmi) || mbmi->tx_size >= TX_32X32) + mbmi->tx_size >= TX_32X32) return DCT_DCT; - return intra_mode_to_tx_type_lookup[get_y_mode(mi, block_idx)]; + return mbmi->tx_type; } void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h index 2f93cb31c..9a471c818 100644 --- a/vp10/common/entropy.h +++ b/vp10/common/entropy.h @@ -21,7 +21,8 @@ extern "C" { #endif -#define DIFF_UPDATE_PROB 252 +#define DIFF_UPDATE_PROB 252 +#define GROUP_DIFF_UPDATE_PROB 252 // Coefficient token alphabet #define ZERO_TOKEN 0 // 0 Extra Bits 0+0 diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c index 2bb292a6b..78f3650f8 100644 --- a/vp10/common/entropymode.c +++ b/vp10/common/entropymode.c @@ -326,6 +326,26 @@ static const struct segmentation_probs default_seg_probs = { }; #endif +const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)] = { + -DCT_DCT, 2, + -ADST_ADST, 4, + -ADST_DCT, -DCT_ADST +}; + +static const vpx_prob default_intra_ext_tx_prob[EXT_TX_SIZES] + [TX_TYPES][TX_TYPES - 1] = { + {{240, 85, 128}, {4, 1, 248}, {4, 1, 8}, {4, 248, 128}}, + {{244, 85, 128}, {8, 2, 248}, {8, 2, 8}, {8, 248, 128}}, + {{248, 85, 128}, {16, 4, 248}, {16, 4, 8}, {16, 248, 128}}, +}; + +static const vpx_prob default_inter_ext_tx_prob[EXT_TX_SIZES] + [TX_TYPES - 1] = { + {160, 85, 128}, + {176, 85, 128}, + {192, 85, 128}, +}; + static void init_mode_probs(FRAME_CONTEXT *fc) { vp10_copy(fc->uv_mode_prob, default_uv_probs); vp10_copy(fc->y_mode_prob, default_if_y_probs); @@ -342,6 +362,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) { vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs); vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs); #endif + vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob); + vp10_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob); } const vpx_tree_index vp10_switchable_interp_tree @@ -431,6 +453,21 @@ void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) { fc->skip_probs[i] = mode_mv_merge_probs( pre_fc->skip_probs[i], counts->skip[i]); + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int j; + for (j = 0; j < TX_TYPES; ++j) + vpx_tree_merge_probs(vp10_ext_tx_tree, + pre_fc->intra_ext_tx_prob[i][j], + counts->intra_ext_tx[i][j], + fc->intra_ext_tx_prob[i][j]); + } + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + vpx_tree_merge_probs(vp10_ext_tx_tree, + pre_fc->inter_ext_tx_prob[i], + counts->inter_ext_tx[i], + fc->inter_ext_tx_prob[i]); + } + #if CONFIG_MISC_FIXES if (cm->seg.temporal_update) { for (i = 0; i < PREDICTION_PROBS; i++) diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h index 42fd9207f..611d3ad13 100644 --- a/vp10/common/entropymode.h +++ b/vp10/common/entropymode.h @@ -66,6 +66,8 @@ typedef struct frame_contexts { #if CONFIG_MISC_FIXES struct segmentation_probs seg; #endif + vpx_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1]; + vpx_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1]; int initialized; } FRAME_CONTEXT; @@ -90,6 +92,8 @@ typedef struct FRAME_COUNTS { #if CONFIG_MISC_FIXES struct seg_counts seg; #endif + unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; + unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES]; } FRAME_COUNTS; extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] @@ -119,6 +123,9 @@ void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); +extern const vpx_tree_index + vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)]; + static INLINE int vp10_ceil_log2(int n) { int i = 1, p = 2; while (p < n) { diff --git a/vp10/common/enums.h b/vp10/common/enums.h index a226a2d69..18c7d1629 100644 --- a/vp10/common/enums.h +++ b/vp10/common/enums.h @@ -97,6 +97,8 @@ typedef enum { TX_TYPES = 4 } TX_TYPE; +#define EXT_TX_SIZES 3 // number of sizes that use extended transforms + typedef enum { VP9_LAST_FLAG = 1 << 0, VP9_GOLD_FLAG = 1 << 1, diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c index e83cb8e67..0c7a1c22a 100644 --- a/vp10/common/thread_common.c +++ b/vp10/common/thread_common.c @@ -435,6 +435,17 @@ void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts, comps->fp[i] += comps_t->fp[i]; } + for (i = 0; i < EXT_TX_SIZES; i++) { + int j; + for (j = 0; j < TX_TYPES; ++j) + for (k = 0; k < TX_TYPES; k++) + cm->counts.intra_ext_tx[i][j][k] += counts->intra_ext_tx[i][j][k]; + } + for (i = 0; i < EXT_TX_SIZES; i++) { + for (k = 0; k < TX_TYPES; k++) + cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k]; + } + #if CONFIG_MISC_FIXES for (i = 0; i < PREDICTION_PROBS; i++) for (j = 0; j < 2; j++) diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c index c0fbc4949..1c3f18239 100644 --- a/vp10/decoder/decodeframe.c +++ b/vp10/decoder/decodeframe.c @@ -268,7 +268,7 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane, if (eob == 1) { dqcoeff[0] = 0; } else { - if (tx_size <= TX_16X16 && eob <= 10) + if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); else if (tx_size == TX_32X32 && eob <= 34) memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); @@ -2109,8 +2109,8 @@ static size_t read_uncompressed_header(VP10Decoder *pbi, setup_segmentation_dequant(cm); #if CONFIG_MISC_FIXES - cm->tx_mode = (xd->lossless[0]) ? ONLY_4X4 - : read_tx_mode(rb); + cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4 + : read_tx_mode(rb); cm->reference_mode = read_frame_reference_mode(cm, rb); #endif @@ -2124,6 +2124,23 @@ static size_t read_uncompressed_header(VP10Decoder *pbi, return sz; } +static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) { + int i, j, k; + if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + for (k = 0; k < TX_TYPES - 1; ++k) + vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k]); + } + } + if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (k = 0; k < TX_TYPES - 1; ++k) + vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k]); + } + } +} + static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data, size_t partition_size) { VP10_COMMON *const cm = &pbi->common; @@ -2205,6 +2222,7 @@ static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data, #endif read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); + read_ext_tx_probs(fc, &r); } return vpx_reader_has_error(&r); @@ -2245,6 +2263,10 @@ static void debug_check_frame_counts(const VP10_COMMON *const cm) { assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx))); assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip))); assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv))); + assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx, + sizeof(cm->counts.intra_ext_tx))); + assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx, + sizeof(cm->counts.inter_ext_tx))); } #endif // NDEBUG diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c index b516333e0..a28ae5592 100644 --- a/vp10/decoder/decodemv.c +++ b/vp10/decoder/decodemv.c @@ -100,6 +100,8 @@ static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode = cm->tx_mode; BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (xd->lossless[xd->mi[0]->mbmi.segment_id]) + return TX_4X4; if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) return read_selected_tx_size(cm, xd, max_tx_size, r); else @@ -294,6 +296,20 @@ static void read_intra_frame_mode_info(VP10_COMMON *const cm, } mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode); + + if (mbmi->tx_size < TX_32X32 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_COUNTS *counts = xd->counts; + TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode]; + mbmi->tx_type = vpx_read_tree( + r, vp10_ext_tx_tree, + cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]); + if (counts) + ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type]; + } else { + mbmi->tx_type = DCT_DCT; + } } static int read_mv_component(vpx_reader *r, @@ -650,6 +666,28 @@ static void read_inter_frame_mode_info(VP10Decoder *const pbi, read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r); else read_intra_block_mode_info(cm, xd, mi, r); + + if (mbmi->tx_size < TX_32X32 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_COUNTS *counts = xd->counts; + if (inter_block) { + mbmi->tx_type = vpx_read_tree( + r, vp10_ext_tx_tree, + cm->fc->inter_ext_tx_prob[mbmi->tx_size]); + if (counts) + ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type]; + } else { + const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode]; + mbmi->tx_type = vpx_read_tree( + r, vp10_ext_tx_tree, + cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]); + if (counts) + ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type]; + } + } else { + mbmi->tx_type = DCT_DCT; + } } void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd, diff --git a/vp10/encoder/aq_complexity.c b/vp10/encoder/aq_complexity.c index 0de044cf9..2506a4e55 100644 --- a/vp10/encoder/aq_complexity.c +++ b/vp10/encoder/aq_complexity.c @@ -51,7 +51,7 @@ void vp10_setup_in_frame_q_adj(VP10_COMP *cpi) { // Make SURE use of floating point in this function is safe. vpx_clear_system_state(); - if (cm->frame_type == KEY_FRAME || + if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { int segment; diff --git a/vp10/encoder/aq_variance.c b/vp10/encoder/aq_variance.c index e8e88c3b6..bed5162fb 100644 --- a/vp10/encoder/aq_variance.c +++ b/vp10/encoder/aq_variance.c @@ -47,7 +47,7 @@ void vp10_vaq_frame_setup(VP10_COMP *cpi) { struct segmentation *seg = &cm->seg; int i; - if (cm->frame_type == KEY_FRAME || + if (frame_is_intra_only(cm) || cm->error_resilient_mode || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { vp10_enable_segmentation(seg); diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c index 361ac9962..ede8bb370 100644 --- a/vp10/encoder/bitstream.c +++ b/vp10/encoder/bitstream.c @@ -58,6 +58,12 @@ static INLINE void write_uniform(vpx_writer *w, int n, int v) { } } +static struct vp10_token ext_tx_encodings[TX_TYPES]; + +void vp10_encode_token_init() { + vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree); +} + static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode, const vpx_prob *probs) { vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]); @@ -90,6 +96,24 @@ static void prob_diff_update(const vpx_tree_index *tree, vp10_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } +static int prob_diff_update_savings(const vpx_tree_index *tree, + vpx_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], + int n) { + int i; + unsigned int branch_ct[32][2]; + int savings = 0; + + // Assuming max number of probabilities <= 32 + assert(n <= 32); + vp10_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) { + savings += vp10_cond_prob_diff_update_savings(&probs[i], + branch_ct[i]); + } + return savings; +} + static void write_selected_tx_size(const VP10_COMMON *cm, const MACROBLOCKD *xd, vpx_writer *w) { TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size; @@ -133,6 +157,49 @@ static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w, counts->switchable_interp[j], SWITCHABLE_FILTERS, w); } +static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) { + const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) - + vp10_cost_zero(GROUP_DIFF_UPDATE_PROB); + int i, j; + + int savings = 0; + int do_update = 0; + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + savings += prob_diff_update_savings( + vp10_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j], + cm->counts.intra_ext_tx[i][j], TX_TYPES); + } + do_update = savings > savings_thresh; + vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + prob_diff_update(vp10_ext_tx_tree, + cm->fc->intra_ext_tx_prob[i][j], + cm->counts.intra_ext_tx[i][j], + TX_TYPES, w); + } + } + savings = 0; + do_update = 0; + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + savings += prob_diff_update_savings( + vp10_ext_tx_tree, cm->fc->inter_ext_tx_prob[i], + cm->counts.inter_ext_tx[i], TX_TYPES); + } + do_update = savings > savings_thresh; + vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB); + if (do_update) { + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + prob_diff_update(vp10_ext_tx_tree, + cm->fc->inter_ext_tx_prob[i], + cm->counts.inter_ext_tx[i], + TX_TYPES, w); + } + } +} + static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, const TOKENEXTRA *const stop, vpx_bit_depth_t bit_depth, const TX_SIZE tx) { @@ -303,7 +370,7 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi, vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && - !(is_inter && skip)) { + !(is_inter && skip) && !xd->lossless[segment_id]) { write_selected_tx_size(cm, xd, w); } @@ -370,6 +437,25 @@ static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi, } } } + if (mbmi->tx_size < TX_32X32 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (is_inter) { + vp10_write_token( + w, vp10_ext_tx_tree, + cm->fc->inter_ext_tx_prob[mbmi->tx_size], + &ext_tx_encodings[mbmi->tx_type]); + } else { + vp10_write_token( + w, vp10_ext_tx_tree, + cm->fc->intra_ext_tx_prob[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]], + &ext_tx_encodings[mbmi->tx_type]); + } + } else { + if (!mbmi->skip) + assert(mbmi->tx_type == DCT_DCT); + } } static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd, @@ -391,7 +477,8 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd, write_skip(cm, xd, mbmi->segment_id, mi, w); - if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) write_selected_tx_size(cm, xd, w); if (bsize >= BLOCK_8X8) { @@ -412,6 +499,16 @@ static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd, } write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]); + + if (mbmi->tx_size < TX_32X32 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + vp10_write_token( + w, vp10_ext_tx_tree, + cm->fc->intra_ext_tx_prob[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]], + &ext_tx_encodings[mbmi->tx_type]); + } } static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile, @@ -1260,7 +1357,7 @@ static void write_uncompressed_header(VP10_COMP *cpi, encode_quantization(cm, wb); encode_segmentation(cm, xd, wb); #if CONFIG_MISC_FIXES - if (xd->lossless[0]) + if (!cm->seg.enabled && xd->lossless[0]) cm->tx_mode = TX_4X4; else write_txfm_mode(cm->tx_mode, wb); @@ -1380,6 +1477,7 @@ static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) { vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc, &counts->mv); + update_ext_tx_probs(cm, &header_bc); } vpx_stop_encode(&header_bc); diff --git a/vp10/encoder/bitstream.h b/vp10/encoder/bitstream.h index aa0ed2fdf..b1da89f1d 100644 --- a/vp10/encoder/bitstream.h +++ b/vp10/encoder/bitstream.h @@ -18,6 +18,7 @@ extern "C" { #include "vp10/encoder/encoder.h" +void vp10_encode_token_init(); void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size); static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) { diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c index 9381b653d..26ce5a1eb 100644 --- a/vp10/encoder/encodeframe.c +++ b/vp10/encoder/encodeframe.c @@ -3024,5 +3024,16 @@ static void encode_superblock(VP10_COMP *cpi, ThreadData *td, } ++td->counts->tx.tx_totals[mbmi->tx_size]; ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])]; + if (mbmi->tx_size < TX_32X32 && + cm->base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (is_inter_block(mbmi)) { + ++td->counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type]; + } else { + ++td->counts->intra_ext_tx[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; + } + } } } diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c index 175c6d855..9e3bec40e 100644 --- a/vp10/encoder/encoder.c +++ b/vp10/encoder/encoder.c @@ -328,6 +328,7 @@ void vp10_initialize_enc(void) { vp10_rc_init_minq_luts(); vp10_entropy_mv_init(); vp10_temporal_filter_init(); + vp10_encode_token_init(); init_done = 1; } } @@ -2654,7 +2655,7 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; if (is_lossless_requested(&cpi->oxcf)) { - lf->filter_level = 0; + lf->filter_level = 0; } else { struct vpx_usec_timer timer; diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h index 2a44e4744..bd6a00932 100644 --- a/vp10/encoder/encoder.h +++ b/vp10/encoder/encoder.h @@ -467,6 +467,8 @@ typedef struct VP10_COMP { int multi_arf_enabled; int multi_arf_last_grp_enabled; + int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES]; + int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES]; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; #endif diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c index 5623a7202..f4fdb2417 100644 --- a/vp10/encoder/rd.c +++ b/vp10/encoder/rd.c @@ -83,6 +83,18 @@ static void fill_mode_costs(VP10_COMP *cpi) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) vp10_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp10_switchable_interp_tree); + + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + for (j = 0; j < TX_TYPES; ++j) + vp10_cost_tokens(cpi->intra_tx_type_costs[i][j], + fc->intra_ext_tx_prob[i][j], + vp10_ext_tx_tree); + } + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + vp10_cost_tokens(cpi->inter_tx_type_costs[i], + fc->inter_ext_tx_prob[i], + vp10_ext_tx_tree); + } } static void fill_token_costs(vp10_coeff_cost *c, diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c index bbddc1d29..90a716d2c 100644 --- a/vp10/encoder/rdopt.c +++ b/vp10/encoder/rdopt.c @@ -54,6 +54,8 @@ #define MIN_EARLY_TERM_INDEX 3 #define NEW_MV_DISCOUNT_FACTOR 8 +const double ext_tx_th = 0.99; + typedef struct { PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame[2]; @@ -598,11 +600,56 @@ static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + TX_TYPE tx_type, best_tx_type = DCT_DCT; + int r, s; + int64_t d, psse, this_rd, best_rd = INT64_MAX; + vpx_prob skip_prob = vp10_get_skip_prob(cm, xd); + int s0 = vp10_cost_bit(skip_prob, 0); + int s1 = vp10_cost_bit(skip_prob, 1); + const int is_inter = is_inter_block(mbmi); + mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size); + if (mbmi->tx_size < TX_32X32 && + !xd->lossless[mbmi->segment_id]) { + for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + mbmi->tx_type = tx_type; + txfm_rd_in_plane(x, &r, &d, &s, + &psse, ref_best_rd, 0, bs, mbmi->tx_size, + cpi->sf.use_fast_coef_costing); + if (r == INT_MAX) + continue; + if (is_inter) + r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type]; + else + r += cpi->intra_tx_type_costs[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; + if (s) + this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse); + else + this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d); + if (is_inter && !xd->lossless[mbmi->segment_id] && !s) + this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse)); + if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) { + best_rd = this_rd; + best_tx_type = mbmi->tx_type; + } + } + } + mbmi->tx_type = best_tx_type; txfm_rd_in_plane(x, rate, distortion, skip, sse, ref_best_rd, 0, bs, mbmi->tx_size, cpi->sf.use_fast_coef_costing); + if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] && + *rate != INT_MAX) { + if (is_inter) + *rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type]; + else + *rate += cpi->intra_tx_type_costs[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; + } } static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x, @@ -632,87 +679,115 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; vpx_prob skip_prob = vp10_get_skip_prob(cm, xd); - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES], sse[TX_SIZES]; - int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}, - {INT64_MAX, INT64_MAX}}; + int r, s; + int64_t d, sse; + int64_t rd = INT64_MAX; int n, m; int s0, s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = INT64_MAX, last_rd = INT64_MAX; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; + const int tx_select = cm->tx_mode == TX_MODE_SELECT; + TX_TYPE tx_type, best_tx_type = DCT_DCT; + const int is_inter = is_inter_block(mbmi); const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); assert(skip_prob > 0); s0 = vp10_cost_bit(skip_prob, 0); s1 = vp10_cost_bit(skip_prob, 1); - if (cm->tx_mode == TX_MODE_SELECT) { + if (tx_select) { start_tx = max_tx_size; end_tx = 0; } else { - TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, - tx_mode_to_biggest_tx_size[cm->tx_mode]); + const TX_SIZE chosen_tx_size = + VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); start_tx = chosen_tx_size; end_tx = chosen_tx_size; } - for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int) max_tx_size); m++) { - if (m == n) - r_tx_size += vp10_cost_zero(tx_probs[m]); - else - r_tx_size += vp10_cost_one(tx_probs[m]); - } - txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], - &sse[n], ref_best_rd, 0, bs, n, - cpi->sf.use_fast_coef_costing); - r[n][1] = r[n][0]; - if (r[n][0] < INT_MAX) { - r[n][1] += r_tx_size; - } - if (d[n] == INT64_MAX || r[n][0] == INT_MAX) { - rd[n][0] = rd[n][1] = INT64_MAX; - } else if (s[n]) { - if (is_inter_block(mbmi)) { - rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]); - r[n][1] -= r_tx_size; - } else { - rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]); - rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]); + *distortion = INT64_MAX; + *rate = INT_MAX; + *skip = 0; + *psse = INT64_MAX; + + for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + last_rd = INT64_MAX; + for (n = start_tx; n >= end_tx; --n) { + int r_tx_size = 0; + for (m = 0; m <= n - (n == (int) max_tx_size); ++m) { + if (m == n) + r_tx_size += vp10_cost_zero(tx_probs[m]); + else + r_tx_size += vp10_cost_one(tx_probs[m]); } - } else { - rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); - rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); - } - if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && - !s[n] && sse[n] != INT64_MAX) { - rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n])); - rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n])); - } + if (n >= TX_32X32 && tx_type != DCT_DCT) { + continue; + } + mbmi->tx_type = tx_type; + txfm_rd_in_plane(x, &r, &d, &s, + &sse, ref_best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); + if (n < TX_32X32 && + !xd->lossless[xd->mi[0]->mbmi.segment_id] && + r != INT_MAX) { + if (is_inter) + r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type]; + else + r += cpi->intra_tx_type_costs[mbmi->tx_size] + [intra_mode_to_tx_type_context[mbmi->mode]] + [mbmi->tx_type]; + } - // Early termination in transform size search. - if (cpi->sf.tx_size_search_breakout && - (rd[n][1] == INT64_MAX || - (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) || - s[n] == 1)) - break; + if (r == INT_MAX) + continue; - if (rd[n][1] < best_rd) { - best_tx = n; - best_rd = rd[n][1]; + if (s) { + if (is_inter) { + rd = RDCOST(x->rdmult, x->rddiv, s1, sse); + } else { + rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse); + } + } else { + rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d); + } + + if (tx_select && !(s && is_inter)) + r += r_tx_size; + + if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s) + rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse)); + + // Early termination in transform size search. + if (cpi->sf.tx_size_search_breakout && + (rd == INT64_MAX || + (s == 1 && tx_type != DCT_DCT && n < start_tx) || + (n < (int) max_tx_size && rd > last_rd))) + break; + + last_rd = rd; + if (rd < + (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) * + best_rd) { + best_tx = n; + best_rd = rd; + *distortion = d; + *rate = r; + *skip = s; + *psse = sse; + best_tx_type = mbmi->tx_type; + } } } - mbmi->tx_size = best_tx; - *distortion = d[mbmi->tx_size]; - *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; - *skip = s[mbmi->tx_size]; - *psse = sse[mbmi->tx_size]; + mbmi->tx_size = best_tx; + mbmi->tx_type = best_tx_type; + if (mbmi->tx_size >= TX_32X32) + assert(mbmi->tx_type == DCT_DCT); + txfm_rd_in_plane(x, &r, &d, &s, + &sse, ref_best_rd, 0, bs, best_tx, + cpi->sf.use_fast_coef_costing); } static void super_block_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *rate, @@ -1065,6 +1140,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x, int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; TX_SIZE best_tx = TX_4X4; + TX_TYPE best_tx_type = DCT_DCT; int *bmode_costs; const MODE_INFO *above_mi = xd->above_mi; const MODE_INFO *left_mi = xd->left_mi; @@ -1091,6 +1167,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x, mode_selected = mode; best_rd = this_rd; best_tx = mic->mbmi.tx_size; + best_tx_type = mic->mbmi.tx_type; *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; @@ -1100,6 +1177,7 @@ static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x, mic->mbmi.mode = mode_selected; mic->mbmi.tx_size = best_tx; + mic->mbmi.tx_type = best_tx_type; return best_rd; } diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c index 6a20ee47d..677910fa3 100644 --- a/vp10/encoder/segmentation.c +++ b/vp10/encoder/segmentation.c @@ -273,7 +273,7 @@ void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) { no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); // Key frames cannot use temporal prediction - if (!frame_is_intra_only(cm)) { + if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs); @@ -300,6 +300,7 @@ void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) { // Now choose which coding method to use. if (t_pred_cost < no_pred_cost) { + assert(!cm->error_resilient_mode); seg->temporal_update = 1; #if !CONFIG_MISC_FIXES memcpy(segp->tree_probs, t_pred_tree, sizeof(t_pred_tree)); diff --git a/vp10/encoder/subexp.c b/vp10/encoder/subexp.c index 67e820b1f..d4074775b 100644 --- a/vp10/encoder/subexp.c +++ b/vp10/encoder/subexp.c @@ -212,3 +212,12 @@ void vp10_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp, vpx_write(w, 0, upd); } } + +int vp10_cond_prob_diff_update_savings(vpx_prob *oldp, + const unsigned int ct[2]) { + const vpx_prob upd = DIFF_UPDATE_PROB; + vpx_prob newp = get_binary_prob(ct[0], ct[1]); + const int savings = vp10_prob_diff_update_savings_search(ct, *oldp, &newp, + upd); + return savings; +} diff --git a/vp10/encoder/subexp.h b/vp10/encoder/subexp.h index 04b96c0bd..091334f1f 100644 --- a/vp10/encoder/subexp.h +++ b/vp10/encoder/subexp.h @@ -37,6 +37,8 @@ int vp10_prob_diff_update_savings_search_model(const unsigned int *ct, vpx_prob upd, int stepsize); +int vp10_cond_prob_diff_update_savings(vpx_prob *oldp, + const unsigned int ct[2]); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 77d1ff459..9545729fb 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -11,7 +11,7 @@ #include "vp9/common/vp9_mvref_common.h" -// This function searches the neighbourhood of a given MB/SB +// This function searches the neighborhood of a given MB/SB // to try and find candidate reference vectors. static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, @@ -24,7 +24,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; int different_ref_found = 0; int context_counter = 0; - const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? + const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL; const TileInfo *const tile = &xd->tile; @@ -59,8 +59,8 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, for (; i < MVREF_NEIGHBOURS; ++i) { const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row * - xd->mi_stride]->mbmi; + const MB_MODE_INFO *const candidate = + &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; different_ref_found = 1; if (candidate->ref_frame[0] == ref_frame) @@ -71,7 +71,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, } // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast - // on windows platform. The sync here is unncessary if use_perv_frame_mvs + // on windows platform. The sync here is unnecessary if use_prev_frame_mvs // is 0. But after removing it, there will be hang in the unit test on windows // due to several threads waiting for a thread's signal. #if defined(_WIN32) && !HAVE_PTHREAD_H @@ -101,8 +101,8 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, for (i = 0; i < MVREF_NEIGHBOURS; ++i) { const POSITION *mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row - * xd->mi_stride]->mbmi; + const MB_MODE_INFO *const candidate = + &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; // If the candidate is INTRA we don't want to consider its mv. IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias, @@ -156,16 +156,6 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, mi_row, mi_col, sync, data, mode_context); } -static void lower_mv_precision(MV *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(mv); - if (!use_hp) { - if (mv->row & 1) - mv->row += (mv->row > 0 ? -1 : 1); - if (mv->col & 1) - mv->col += (mv->col > 0 ? -1 : 1); - } -} - void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv) { diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index bd216d433..22fbaf857 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -157,7 +157,7 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector it will also -// skip all additional processing and jump to done! +// skip all additional processing and jump to Done! #define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \ do { \ if (refmv_count) { \ @@ -207,6 +207,16 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } +static INLINE void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? -1 : 1); + } +} + typedef void (*find_mv_refs_sync)(void *const data, int mi_row); void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index d8c14ecc8..37658dc94 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -190,6 +190,12 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); +#if CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->mbmi.sb_type != BLOCK_4X8 && + xd->mi[0]->mbmi.sb_type != BLOCK_8X4); + assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); +#endif if (plane == 0) pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; else if (plane == 1) diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 39e4dcfe3..e27634cdd 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -587,7 +587,12 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, // Co-ordinate of containing block to pixel precision. int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); - +#if CONFIG_BETTER_HW_COMPATIBILITY + assert(xd->mi[0]->mbmi.sb_type != BLOCK_4X8 && + xd->mi[0]->mbmi.sb_type != BLOCK_8X4); + assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) && + mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x))); +#endif // Co-ordinate of the block to 1/16th pixel precision. x0_16 = (x_start + x) << SUBPEL_BITS; y0_16 = (y_start + y) << SUBPEL_BITS; @@ -714,6 +719,18 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter]; const BLOCK_SIZE sb_type = mi->mbmi.sb_type; const int is_compound = has_second_ref(&mi->mbmi); + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const MV_REFERENCE_FRAME frame = mi->mbmi.ref_frame[ref]; + RefBuffer *ref_buf = &pbi->common.frame_refs[frame - LAST_FRAME]; + + xd->block_refs[ref] = ref_buf; + if (!vp9_is_valid_scale(&ref_buf->sf)) + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); + } for (plane = 0; plane < MAX_MB_PLANE; ++plane) { struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index d3ca7b3fe..42f554591 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -284,12 +284,19 @@ static int read_mv_component(vpx_reader *r, return sign ? -mag : mag; } +// TODO(slavarnway): move to vp9_entropymv.h and replace vp9_use_mv_hp +#define COMPANDED_MVREF_THRESH 8 +static int use_mv_hp(const MV *ref) { + return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && + (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; +} + static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref, const nmv_context *ctx, nmv_context_counts *counts, int allow_hp) { const MV_JOINT_TYPE joint_type = (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints); - const int use_hp = allow_hp && vp9_use_mv_hp(ref); + const int use_hp = allow_hp && use_mv_hp(ref); MV diff = {0, 0}; if (mv_joint_vertical(joint_type)) @@ -476,12 +483,203 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, } } +static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, + int_mv *nearest_mv, int_mv *near_mv, + int refmv_count) { + int i; + + // Make sure all the candidates are properly clamped etc + for (i = 0; i < refmv_count; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + static void fpm_sync(void *const data, int mi_row) { VP9Decoder *const pbi = (VP9Decoder *)data; vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, mi_row << MI_BLOCK_SIZE_LOG2); } +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector or early_break +// it will also skip all additional processing and jump to Done! +#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != (mv_ref_list)[0].as_int) { \ + (mv_ref_list)[(refmv_count)] = (mv); \ + refmv_count++; \ + goto Done; \ + } \ + } else { \ + (mv_ref_list)[(refmv_count)++] = (mv); \ + if (early_break) \ + goto Done; \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \ + refmv_count, mv_ref_list, Done) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ + refmv_count, mv_ref_list, Done); \ + } \ + } while (0) + +// This function searches the neighborhood of a given MB/SB +// to try and find candidate reference vectors. +static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + const POSITION *const mv_ref_search, + int_mv *mv_ref_list, + int mi_row, int mi_col, + find_mv_refs_sync sync, void *const data) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + int different_ref_found = 0; + const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? + cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL; + const TileInfo *const tile = &xd->tile; + // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop + // searching after the first mv is found. + const int early_break = (mi->mbmi.mode == NEARESTMV) || + (mi->mbmi.mode == NEWMV); + + // Blank the reference vector list + memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = + &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast + // on windows platform. The sync here is unnecessary if use_prev_frame_mvs + // is 0. But after removing it, there will be hang in the unit test on windows + // due to several threads waiting for a thread's signal. +#if defined(_WIN32) && !HAVE_PTHREAD_H + if (cm->frame_parallel_decode && sync != NULL) { + sync(data, mi_row); + } +#endif + + // Check the last frame's mode and mv info. + if (prev_frame_mvs) { + // Synchronize here for frame parallel decode if sync function is provided. + if (cm->frame_parallel_decode && sync != NULL) { + sync(data, mi_row); + } + + if (prev_frame_mvs->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); + } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = + &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi; + + // If the candidate is INTRA we don't want to consider its mv. + IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias, + refmv_count, mv_ref_list, Done); + } + } + } + + // Since we still don't have a candidate we'll try the last frame. + if (prev_frame_mvs) { + if (prev_frame_mvs->ref_frame[0] != ref_frame && + prev_frame_mvs->ref_frame[0] > INTRA_FRAME) { + int_mv mv = prev_frame_mvs->mv[0]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + + if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME && + prev_frame_mvs->ref_frame[1] != ref_frame && + prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) { + int_mv mv = prev_frame_mvs->mv[1]; + if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] != + ref_sign_bias[ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done); + } + } + + if (mi->mbmi.mode == NEARMV) + refmv_count = MAX_MV_REF_CANDIDATES; + else + // we only care about the nearestmv for the remaining modes + refmv_count = 1; + + Done: + // Clamp vectors + for (i = 0; i < refmv_count; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return refmv_count; +} + +static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const POSITION *const mv_ref_search, + int mi_row, int mi_col) { + int i; + int context_counter = 0; + const TileInfo *const tile = &xd->tile; + + // Get mode count from nearest 2 blocks + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; + const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + } + } + + return counter_to_context[context_counter]; +} + static void read_inter_block_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, MODE_INFO *const mi, @@ -491,26 +689,13 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; int_mv nearestmv[2], nearmv[2]; - int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int ref, is_compound; - uint8_t inter_mode_ctx[MAX_REF_FRAMES]; + uint8_t inter_mode_ctx; + const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); is_compound = has_second_ref(mbmi); - - for (ref = 0; ref < 1 + is_compound; ++ref) { - const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!vp9_is_valid_scale(&ref_buf->sf))) - vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Reference frame has invalid dimensions"); - vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, - &ref_buf->sf); - vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame], - mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx); - } + inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col); if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; @@ -521,14 +706,27 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, } } else { if (bsize >= BLOCK_8X8) - mbmi->mode = read_inter_mode(cm, xd, r, - inter_mode_ctx[mbmi->ref_frame[0]]); - } - - if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { - for (ref = 0; ref < 1 + is_compound; ++ref) { - vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]], - &nearestmv[ref], &nearmv[ref]); + mbmi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx); + else + // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV. + // Setting mode to NEARESTMV forces the search to stop after the nearestmv + // has been found. After b_modes have been read, mode will be overwritten + // by the last b_mode. + mbmi->mode = NEARESTMV; + + if (mbmi->mode != ZEROMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv ref_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + int refmv_count; + + refmv_count = dec_find_mv_refs(cm, xd, mi, frame, mv_ref_search, + ref_mvs, mi_row, mi_col, fpm_sync, + (void *)pbi); + + dec_find_best_ref_mvs(xd, allow_hp, ref_mvs, &nearestmv[ref], + &nearmv[ref], refmv_count); + } } } @@ -546,7 +744,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv block[2]; const int j = idy * 2 + idx; - b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]); + b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { uint8_t dummy_mode_ctx[MAX_REF_FRAMES]; diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 0def2cf1f..63db214d1 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -191,7 +191,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, BLOCK_SIZE bsize, int64_t rate, int64_t dist, - int skip) { + int skip, + struct macroblock_plane *const p) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int bw = num_8x8_blocks_wide_lookup[bsize]; @@ -199,12 +200,25 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; - const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, - bsize); + int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize); // Default is to not update the refresh map. int new_map_value = cr->map[block_index]; int x = 0; int y = 0; + int is_skin = 0; + if (refresh_this_block == 0 && + bsize <= BLOCK_16X16 && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { + is_skin = vp9_compute_skin_block(p[0].src.buf, + p[1].src.buf, + p[2].src.buf, + p[0].src.stride, + p[1].src.stride, + bsize); + if (is_skin) + refresh_this_block = 1; + } + // If this block is labeled for refresh, check if we should reset the // segment_id. if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index a5b38138b..edf0a973e 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -14,6 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_skin_detection.h" #ifdef __cplusplus extern "C" { @@ -93,7 +95,8 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i, void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi, MB_MODE_INFO *const mbmi, int mi_row, int mi_col, BLOCK_SIZE bsize, - int64_t rate, int64_t dist, int skip); + int64_t rate, int64_t dist, int skip, + struct macroblock_plane *const p); void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, const MB_MODE_INFO *const mbmi, diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 93aa40ae9..6533902b3 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -333,20 +333,12 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int is_skin = 0; if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kDenLow) { - // Take center pixel in block to determine is_skin. - const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1; - const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1; - const int uv_width_shift = y_width_shift >> 1; - const int uv_height_shift = y_height_shift >> 1; - const int stride = mb->plane[0].src.stride; - const int strideuv = mb->plane[1].src.stride; - const uint8_t ysource = - mb->plane[0].src.buf[y_height_shift * stride + y_width_shift]; - const uint8_t usource = - mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift]; - const uint8_t vsource = - mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift]; - is_skin = vp9_skin_pixel(ysource, usource, vsource); + is_skin = vp9_compute_skin_block(mb->plane[0].src.buf, + mb->plane[1].src.buf, + mb->plane[2].src.buf, + mb->plane[0].src.stride, + mb->plane[1].src.stride, + bs); } mv_col = ctx->best_sse_mv.as_mv.col; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index cc4d1f14e..c07eee969 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1045,7 +1045,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, bsize, ctx->rate, ctx->dist, - x->skip); + x->skip, p); } } @@ -1705,6 +1705,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *const p = x->plane; const struct segmentation *const seg = &cm->seg; const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type]; const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type]; @@ -1725,7 +1726,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } else { // Setting segmentation map for cyclic_refresh. vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, - ctx->rate, ctx->dist, x->skip); + ctx->rate, ctx->dist, x->skip, p); } vp9_init_plane_quantizers(cpi, x); } diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index 4befbb066..008a40afc 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -145,10 +145,6 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { const uint8_t *src_u = cpi->Source->u_buffer; const uint8_t *src_v = cpi->Source->v_buffer; const int src_uvstride = cpi->Source->uv_stride; - const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1; - const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1; - const int uv_width_shift = y_width_shift >> 1; - const int uv_height_shift = y_height_shift >> 1; int mi_row, mi_col; int num_low_motion = 0; int frame_low_motion = 1; @@ -173,13 +169,12 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. - const uint8_t ysource = - src_y[y_height_shift * src_ystride + y_width_shift]; - const uint8_t usource = - src_u[uv_height_shift * src_uvstride + uv_width_shift]; - const uint8_t vsource = - src_v[uv_height_shift * src_uvstride + uv_width_shift]; - int is_skin = vp9_skin_pixel(ysource, usource, vsource); + int is_skin = vp9_compute_skin_block(src_y, + src_u, + src_v, + src_ystride, + src_uvstride, + bsize); if (frame_low_motion && cr->consec_zero_mv[bl_index] > thresh_consec_zeromv && cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv && diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index b929758ca..71b8bdba0 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1263,9 +1263,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, ref_frame = ref_mode_set[idx].ref_frame; if (cpi->use_svc) { ref_frame = ref_mode_set_svc[idx].ref_frame; - if (svc_force_zero_mode[ref_frame - 1] && - frame_mv[this_mode][ref_frame].as_int != 0) - continue; } if (!(cpi->ref_frame_flags & flag_list[ref_frame])) @@ -1273,6 +1270,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (const_motion[ref_frame] && this_mode == NEARMV) continue; + if (cpi->use_svc) { + if (svc_force_zero_mode[ref_frame - 1] && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + if (!(frame_mv[this_mode][ref_frame].as_int == 0 && ref_frame == LAST_FRAME)) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; @@ -1855,6 +1858,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame_skip_mask & (1 << ref_frame)) continue; +#if CONFIG_BETTER_HW_COMPATIBILITY + if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) && + ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + continue; +#endif + // TODO(jingning, agrange): Scaling reference frame not supported for // sub8x8 blocks. Is this supported now? if (ref_frame > INTRA_FRAME && diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2a6b70703..b8d17205d 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1349,11 +1349,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter]; for (ref = 0; ref < 1 + is_compound; ++ref) { - const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i, - pd->pre[ref].stride)]; + const int bw = b_width_log2_lookup[BLOCK_8X8]; + const int h = 4 * (i >> bw); + const int w = 4 * (i & ((1 << bw) - 1)); + const struct scale_factors *sf = &xd->block_refs[ref]->sf; + int y_stride = pd->pre[ref].stride; + uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w); + + if (vp9_is_scaled(sf)) { + const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + y_stride = xd->block_refs[ref]->buf->y_stride; + pre = xd->block_refs[ref]->buf->y_buffer; + pre += scaled_buffer_offset(x_start + w, y_start + h, + y_stride, sf); + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_highbd_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, @@ -1361,7 +1375,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); } else { - vp9_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, @@ -1370,7 +1384,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, mi_row * MI_SIZE + 4 * (i / 2)); } #else - vp9_build_inter_predictor(pre, pd->pre[ref].stride, + vp9_build_inter_predictor(pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, @@ -3021,7 +3035,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { // Skip checking missing references in both single and compound reference - // modes. Note that a mode will be skipped iff both reference frames + // modes. Note that a mode will be skipped if both reference frames // are masked out. ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; @@ -3804,6 +3818,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, ref_frame = vp9_ref_order[ref_index].ref_frame[0]; second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; +#if CONFIG_BETTER_HW_COMPATIBILITY + // forbid 8X4 and 4X8 partitions if any reference frame is scaled. + if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { + int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + if (second_ref_frame > INTRA_FRAME) + ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); + if (ref_scaled) + continue; + } +#endif // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) { diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c index c2763b7da..0ca166536 100644 --- a/vp9/encoder/vp9_skin_detection.c +++ b/vp9/encoder/vp9_skin_detection.c @@ -48,6 +48,20 @@ int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) { return (evaluate_skin_color_difference(cb, cr) < skin_threshold); } +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize) { + // Take center pixel in block to determine is_skin. + const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1; + const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1; + const int uv_width_shift = y_width_shift >> 1; + const int uv_height_shift = y_height_shift >> 1; + const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; + const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; + const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + return vp9_skin_pixel(ysource, usource, vsource); +} + + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h index 0a87ef9f4..73f7c39d9 100644 --- a/vp9/encoder/vp9_skin_detection.h +++ b/vp9/encoder/vp9_skin_detection.h @@ -23,6 +23,9 @@ struct VP9_COMP; int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr); +int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, int bsize); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 318d8100c..c5f0bad8f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -394,7 +394,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; } else { for (i = 0; i < BLOCK_SIZES; ++i) - if (i >= BLOCK_16X16) + if (i > BLOCK_16X16) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; else // Use H and V intra mode for block sizes <= 16X16. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index a4e7eb19e..30a7d1013 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -36,6 +36,12 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->current_superframe = 0; for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + cpi->svc.ext_frame_flags[sl] = 0; + cpi->svc.ext_lst_fb_idx[sl] = 0; + cpi->svc.ext_gld_fb_idx[sl] = 1; + cpi->svc.ext_alt_fb_idx[sl] = 2; + } if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, @@ -566,6 +572,8 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is // needed to support the case where the frame flags may be passed in via // vpx_codec_encode(), which can be used for the temporal-only svc case. + // TODO(marpan): Consider adding an enc_config parameter to better handle + // this case. if (cpi->ext_refresh_frame_flags_pending == 0) { int sl; cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index e9e952c48..7958c6980 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -28,7 +28,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/ #define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f71769918..6c6f15e51 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -288,13 +288,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_d153_predictor_4x4/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_4x4/; @@ -387,7 +387,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc"; + specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc"; @@ -435,10 +435,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc"; + specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc"; + specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_top_predictor_32x32/; @@ -990,10 +990,10 @@ add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc"; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x8 msa/, "$sse_x86inc"; +specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc"; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc"; +specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc"; # # Avg @@ -1061,10 +1061,10 @@ add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stri specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc"; add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x8_avg msa/, "$sse_x86inc"; +specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc"; add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x4_avg msa/, "$sse_x86inc"; +specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc"; # # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm index b12d29c0a..c61b62104 100644 --- a/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -17,24 +17,20 @@ pw_16: times 4 dd 16 pw_32: times 4 dd 32 SECTION .text -INIT_MMX sse +INIT_XMM sse2 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset GET_GOT goffsetq movq m0, [aboveq] movq m2, [leftq] - DEFINE_ARGS dst, stride, one - mov oned, 0x0001 - pxor m1, m1 - movd m3, oned - pshufw m3, m3, 0x0 paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 paddw m0, [GLOBAL(pw_4)] psraw m0, 3 - pshufw m0, m0, 0x0 + pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq*2], m0 lea dstq, [dstq+strideq*4] @@ -122,30 +118,29 @@ cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset RESTORE_GOT REP_RET -%if ARCH_X86_64 INIT_XMM sse2 -cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset GET_GOT goffsetq - pxor m1, m1 mova m0, [aboveq] mova m2, [aboveq+16] mova m3, [aboveq+32] mova m4, [aboveq+48] - mova m5, [leftq] - mova m6, [leftq+16] - mova m7, [leftq+32] - mova m8, [leftq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - paddw m0, m5 - paddw m0, m6 - paddw m0, m7 - paddw m0, m8 movhlps m2, m0 paddw m0, m2 punpcklwd m0, m1 @@ -181,9 +176,8 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset RESTORE_GOT REP_RET -%endif -INIT_MMX sse +INIT_XMM sse2 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above movq m0, [aboveq] movq [dstq ], m0 @@ -261,43 +255,44 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above jnz .loop REP_RET -INIT_MMX sse -cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one +INIT_XMM sse2 +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movd m1, [aboveq-2] movq m0, [aboveq] - pshufw m1, m1, 0x0 + pshuflw m1, m1, 0x0 + movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 + movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - movd m3, oned + pcmpeqw m3, m3 movd m4, bpsd - pshufw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -2 - mova m2, m3 + psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 - add leftq, 8 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movq m1, [leftq+lineq*4] - movq m2, [leftq+lineq*4+2] - pshufw m1, m1, 0x0 - pshufw m2, m2, 0x0 - paddw m1, m0 + pcmpeqw m2, m2 + pxor m4, m4 ; min possible value + pxor m3, m2 ; max possible value + mova m1, [leftq] + pshuflw m2, m1, 0x0 + pshuflw m5, m1, 0x55 + movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 paddw m2, m0 ;Clamp to the bit-depth - pminsw m1, m3 pminsw m2, m3 - pmaxsw m1, m4 pmaxsw m2, m4 ;Store the values - movq [dstq ], m1 - movq [dstq+strideq*2], m2 + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET + pshuflw m2, m1, 0xaa + pshuflw m5, m1, 0xff + movlhps m2, m5 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + RET INIT_XMM sse2 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one @@ -343,63 +338,55 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one jnz .loop REP_RET -%if ARCH_X86_64 INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m7, m7 - pxor m8, m8 - pinsrw m7, oned, 0 - pinsrw m8, bpsd, 0 - pshuflw m7, m7, 0x0 + pcmpeqw m3, m3 + movd m4, bpsd + punpcklqdq m2, m2 + psllw m3, m4 + pcmpeqw m5, m5 + pxor m4, m4 ; min possible value + pxor m3, m5 ; max possible value DEFINE_ARGS dst, stride, line, left - punpcklqdq m7, m7 mov lineq, -8 - mova m5, m7 - punpcklqdq m2, m2 - psllw m7, m8 - add leftq, 32 - psubw m7, m5 ; max possible value - pxor m8, m8 ; min possible value psubw m0, m2 psubw m1, m2 .loop: - movd m2, [leftq+lineq*4] - movd m3, [leftq+lineq*4+2] - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m4, m2, m0 - paddw m5, m3, m0 + movd m7, [leftq] + pshuflw m5, m7, 0x0 + pshuflw m2, m7, 0x55 + punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 + punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 + paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 + pminsw m6, m3 + pminsw m5, m3 + pmaxsw m6, m4 ; Clamp to the bit-depth + pmaxsw m5, m4 + mova [dstq ], m6 + mova [dstq +16], m5 + paddw m6, m2, m0 paddw m2, m1 - paddw m3, m1 - ;Clamp to the bit-depth - pminsw m4, m7 - pminsw m5, m7 - pminsw m2, m7 - pminsw m3, m7 - pmaxsw m4, m8 - pmaxsw m5, m8 - pmaxsw m2, m8 - pmaxsw m3, m8 - ;Store the values - mova [dstq ], m4 - mova [dstq+strideq*2 ], m5 - mova [dstq +16], m2 - mova [dstq+strideq*2+16], m3 + pminsw m6, m3 + pminsw m2, m3 + pmaxsw m6, m4 + pmaxsw m2, m4 + mova [dstq+strideq*2 ], m6 + mova [dstq+strideq*2+16], m2 lea dstq, [dstq+strideq*4] inc lineq + lea leftq, [leftq+4] + jnz .loop REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps movd m0, [aboveq-2] mova m1, [aboveq] mova m2, [aboveq+16] @@ -407,70 +394,60 @@ cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one mova m4, [aboveq+48] pshuflw m0, m0, 0x0 ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m10, m10 - pxor m11, m11 - pinsrw m10, oned, 0 - pinsrw m11, bpsd, 0 - pshuflw m10, m10, 0x0 + pcmpeqw m5, m5 + movd m6, bpsd + psllw m5, m6 + pcmpeqw m7, m7 + pxor m6, m6 ; min possible value + pxor m5, m7 ; max possible value + punpcklqdq m0, m0 DEFINE_ARGS dst, stride, line, left - punpcklqdq m10, m10 mov lineq, -16 - mova m5, m10 - punpcklqdq m0, m0 - psllw m10, m11 - add leftq, 64 - psubw m10, m5 ; max possible value - pxor m11, m11 ; min possible value psubw m1, m0 psubw m2, m0 psubw m3, m0 psubw m4, m0 .loop: - movd m5, [leftq+lineq*4] - movd m6, [leftq+lineq*4+2] - pshuflw m5, m5, 0x0 - pshuflw m6, m6, 0x0 - punpcklqdq m5, m5 - punpcklqdq m6, m6 - paddw m7, m5, m1 - paddw m8, m5, m2 - paddw m9, m5, m3 - paddw m5, m4 - ;Clamp these values to the bit-depth - pminsw m7, m10 - pminsw m8, m10 - pminsw m9, m10 - pminsw m5, m10 - pmaxsw m7, m11 - pmaxsw m8, m11 - pmaxsw m9, m11 - pmaxsw m5, m11 - ;Store these values - mova [dstq ], m7 - mova [dstq +16], m8 - mova [dstq +32], m9 - mova [dstq +48], m5 - paddw m7, m6, m1 - paddw m8, m6, m2 - paddw m9, m6, m3 - paddw m6, m4 - ;Clamp these values to the bit-depth - pminsw m7, m10 - pminsw m8, m10 - pminsw m9, m10 - pminsw m6, m10 - pmaxsw m7, m11 - pmaxsw m8, m11 - pmaxsw m9, m11 - pmaxsw m6, m11 - ;Store these values - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m8 - mova [dstq+strideq*2+32], m9 - mova [dstq+strideq*2+48], m6 + movd m7, [leftq] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +48], m0 + movd m7, [leftq+2] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2 ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+48], m0 lea dstq, [dstq+strideq*4] + lea leftq, [leftq+4] inc lineq jnz .loop REP_RET -%endif diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 22d52a2af..30ee81b68 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -79,20 +79,13 @@ SECTION .text %macro INC_SRC_BY_SRC_STRIDE 0 %if ARCH_X86=1 && CONFIG_PIC=1 - lea srcq, [srcq + src_stridemp*2] + add srcq, src_stridemp + add srcq, src_stridemp %else lea srcq, [srcq + src_strideq*2] %endif %endmacro -%macro INC_SRC_BY_SRC_2STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 - lea srcq, [srcq + src_stridemp*4] -%else - lea srcq, [srcq + src_strideq*4] -%endif -%endmacro - %macro SUBPEL_VARIANCE 1-2 0 ; W %define bilin_filter_m bilin_filter_m_sse2 %define filter_idx_shift 5 @@ -984,8 +977,9 @@ SECTION .text .x_other_y_other_loop: movu m2, [srcq] movu m4, [srcq+2] - movu m3, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*2+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd @@ -1018,7 +1012,7 @@ SECTION .text SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 - INC_SRC_BY_SRC_2STRIDE + INC_SRC_BY_SRC_STRIDE lea dstq, [dstq + dst_strideq * 4] %if %2 == 1 ; avg add secq, sec_str diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index b45331caa..81ec5dbdb 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, } #if CONFIG_USE_X86INC +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ int x_offset, int y_offset, \ const uint16_t *dst, \ ptrdiff_t dst_stride, \ - int height, unsigned int *sse); + int height, \ + unsigned int *sse, \ + void *unused0, void *unused); #define DECLS(opt1, opt2) \ DECL(8, opt1); \ DECL(16, opt1) @@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, h, \ - &sse); \ + &sse, NULL, NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ @@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ x_offset, y_offset, \ dst + 16, \ dst_stride, \ - h, &sse2); \ + h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, h, &sse2); \ + dst + 48, dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ - h, &sse); \ + h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ @@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ x_offset, y_offset, \ dst + 16, \ dst_stride, \ - h, &sse2); \ + h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ x_offset, y_offset, \ dst + 48, dst_stride, \ - h, &sse2); \ + h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, \ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ }\ @@ -410,6 +417,7 @@ FNS(sse2, sse); #undef FNS #undef FN +// The 2 unused parameters are place holders for PIC enabled build. #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ @@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ const uint16_t *sec, \ ptrdiff_t sec_stride, \ int height, \ - unsigned int *sse); + unsigned int *sse, \ + void *unused0, void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ DECL(8, opt1) @@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, \ - y_offset, dst, dst_stride, sec, w, h, &sse); \ + y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16, src_stride, x_offset, y_offset, \ - dst + 16, dst_stride, sec + 16, w, h, &sse2); \ + dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32, src_stride, x_offset, y_offset, \ - dst + 32, dst_stride, sec + 32, w, h, &sse2); \ + dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, sec + 48, w, h, &sse2); \ + dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, \ y_offset, dst, dst_stride, \ - sec, w, h, &sse); \ + sec, w, h, &sse, NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16, src_stride, \ x_offset, y_offset, \ dst + 16, dst_stride, \ - sec + 16, w, h, &sse2); \ + sec + 16, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ @@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 32, src_stride, \ x_offset, y_offset, \ dst + 32, dst_stride, \ - sec + 32, w, h, &sse2); \ + sec + 32, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48, src_stride, \ x_offset, y_offset, \ dst + 48, dst_stride, \ - sec + 48, w, h, &sse2); \ + sec + 48, w, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, \ y_offset, dst + (start_row * dst_stride), dst_stride, \ - sec + (start_row * w), w, height, &sse2); \ + sec + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ @@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 16 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2); \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ @@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ src + 32 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2); \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, \ x_offset, y_offset, \ dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2); \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index 0defe1b6d..1ec906c23 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -17,7 +17,7 @@ SECTION .text %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 -cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg @@ -25,7 +25,7 @@ cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 @@ -222,8 +222,8 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 -; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); +; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 mov n_rowsd, %1/4 @@ -236,31 +236,32 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 + movlhps m1, m3 %if %2 == 1 pavgb m1, [second_predq+mmsize*0] - pavgb m3, [second_predq+mmsize*1] - lea second_predq, [second_predq+mmsize*2] + lea second_predq, [second_predq+mmsize*1] %endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] - movd m6, [srcq+src_stride3q] + movd m3, [srcq+src_stride3q] punpckldq m2, m5 - punpckldq m4, m6 + punpckldq m4, m3 + movlhps m2, m4 psadbw m1, m2 - psadbw m3, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] - paddd m0, m3 dec n_rowsd jg .loop + movhlps m1, m0 + paddd m0, m1 movd eax, m0 RET %endmacro -INIT_MMX sse +INIT_XMM sse2 SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse |