diff options
Diffstat (limited to 'vp8/common')
-rw-r--r-- | vp8/common/blockd.h | 45 | ||||
-rw-r--r-- | vp8/common/default_coef_probs.h | 24 | ||||
-rw-r--r-- | vp8/common/entropy.c | 15 | ||||
-rw-r--r-- | vp8/common/entropy.h | 2 | ||||
-rw-r--r-- | vp8/common/entropymode.c | 21 | ||||
-rw-r--r-- | vp8/common/entropymode.h | 8 | ||||
-rw-r--r-- | vp8/common/entropymv.c | 402 | ||||
-rw-r--r-- | vp8/common/entropymv.h | 62 | ||||
-rw-r--r-- | vp8/common/findnearmv.c | 27 | ||||
-rw-r--r-- | vp8/common/idct.h | 3 | ||||
-rw-r--r-- | vp8/common/idctllm.c | 8 | ||||
-rw-r--r-- | vp8/common/loopfilter_filters.c | 6 | ||||
-rw-r--r-- | vp8/common/onyxc_int.h | 42 | ||||
-rw-r--r-- | vp8/common/postproc.c | 7 | ||||
-rw-r--r-- | vp8/common/pred_common.c | 5 | ||||
-rw-r--r-- | vp8/common/pred_common.h | 5 | ||||
-rw-r--r-- | vp8/common/reconinter.c | 10 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.sh | 219 | ||||
-rw-r--r-- | vp8/common/sadmxn.h | 37 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_mmx.asm | 784 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_sse2.asm | 523 | ||||
-rw-r--r-- | vp8/common/x86/loopfilter_x86.c | 430 |
22 files changed, 704 insertions, 1981 deletions
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 3ab4cc3a9..cb546e74b 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -86,9 +86,7 @@ typedef enum BILINEAR = 1, EIGHTTAP = 2, EIGHTTAP_SHARP = 3, -#if CONFIG_SWITCHABLE_INTERP SWITCHABLE /* should be the last one */ -#endif } INTERPOLATIONFILTERTYPE; typedef enum @@ -135,14 +133,12 @@ typedef enum { TX_SIZE_MAX // Number of different transforms available } TX_SIZE; -#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 typedef enum { DCT_DCT = 0, // DCT in both horizontal and vertical - ADST_DCT = 1, // ADST in horizontal, DCT in vertical - DCT_ADST = 2, // DCT in horizontal, ADST in vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal ADST_ADST = 3 // ADST in both directions } TX_TYPE; -#endif #define VP8_YMODES (B_PRED + 1) #define VP8_UV_MODES (TM_PRED + 1) @@ -177,6 +173,14 @@ typedef enum { #define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */ #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4) +typedef enum { + PARTITIONING_16X8 = 0, + PARTITIONING_8X16, + PARTITIONING_8X8, + PARTITIONING_4X4, + NB_PARTITIONINGS, +} SPLITMV_PARTITIONING_TYPE; + /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ @@ -184,9 +188,7 @@ typedef enum { union b_mode_info { struct { B_PREDICTION_MODE first; -#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 TX_TYPE tx_type; -#endif #if CONFIG_COMP_INTRA_PRED B_PREDICTION_MODE second; @@ -220,7 +222,7 @@ typedef struct { int mv_ref_index[MAX_REF_FRAMES]; #endif - unsigned char partitioning; + SPLITMV_PARTITIONING_TYPE partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ unsigned char need_to_clamp_mvs; unsigned char need_to_clamp_secondmv; @@ -239,9 +241,7 @@ typedef struct { // Flag to turn prediction signal filter on(1)/off(0 ) at the MB level unsigned int pred_filter_enabled; #endif -#if CONFIG_SWITCHABLE_INTERP INTERPOLATIONFILTERTYPE interp_filter; -#endif #if CONFIG_SUPERBLOCKS // FIXME need a SB array of 4 MB_MODE_INFOs that @@ -388,17 +388,11 @@ typedef struct MacroBlockD { } MACROBLOCKD; -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 #define ACTIVE_HT 110 // quantization stepsize threshold -#endif -#if CONFIG_HYBRIDTRANSFORM8X8 #define ACTIVE_HT8 300 -#endif -#if CONFIG_HYBRIDTRANSFORM16X16 #define ACTIVE_HT16 300 -#endif // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { @@ -442,7 +436,6 @@ static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { return b_mode; } -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 // transform mapping static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { // map transform type @@ -470,9 +463,7 @@ static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { } return tx_type; } -#endif -#if CONFIG_HYBRIDTRANSFORM static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { TX_TYPE tx_type = DCT_DCT; if (xd->mode_info_context->mbmi.mode == B_PRED && @@ -481,9 +472,7 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { } return tx_type; } -#endif -#if CONFIG_HYBRIDTRANSFORM8X8 static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { TX_TYPE tx_type = DCT_DCT; if (xd->mode_info_context->mbmi.mode == I8X8_PRED && @@ -492,9 +481,7 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { } return tx_type; } -#endif -#if CONFIG_HYBRIDTRANSFORM16X16 static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) { TX_TYPE tx_type = DCT_DCT; if (xd->mode_info_context->mbmi.mode < I8X8_PRED && @@ -503,34 +490,24 @@ static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) { } return tx_type; } -#endif -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || \ - CONFIG_HYBRIDTRANSFORM16X16 static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) { TX_TYPE tx_type = DCT_DCT; int ib = (b - xd->block); if (ib >= 16) return tx_type; -#if CONFIG_HYBRIDTRANSFORM16X16 if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) { tx_type = get_tx_type_16x16(xd, b); } -#endif -#if CONFIG_HYBRIDTRANSFORM8X8 if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { ib = (ib & 8) + ((ib & 4) >> 1); tx_type = get_tx_type_8x8(xd, &xd->block[ib]); } -#endif -#if CONFIG_HYBRIDTRANSFORM if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { tx_type = get_tx_type_4x4(xd, b); } -#endif return tx_type; } -#endif extern void vp8_build_block_doffsets(MACROBLOCKD *xd); extern void vp8_setup_block_dptrs(MACROBLOCKD *xd); diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h index 5e21195ee..bd1f795d0 100644 --- a/vp8/common/default_coef_probs.h +++ b/vp8/common/default_coef_probs.h @@ -13,9 +13,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES] -[COEF_BANDS] -[PREV_COEF_CONTEXTS] -[ENTROPY_NODES] = { + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { { /* Block Type ( 0 ) */ { @@ -254,11 +254,10 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES] } }; -#if CONFIG_HYBRIDTRANSFORM static const vp8_prob default_hybrid_coef_probs [BLOCK_TYPES] -[COEF_BANDS] -[PREV_COEF_CONTEXTS] -[ENTROPY_NODES] = { + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { { /* Block Type ( 0 ) */ { @@ -496,7 +495,6 @@ static const vp8_prob default_hybrid_coef_probs [BLOCK_TYPES] } } }; -#endif static const vp8_prob default_coef_probs_8x8[BLOCK_TYPES_8X8] @@ -731,12 +729,11 @@ default_coef_probs_8x8[BLOCK_TYPES_8X8] } }; -#if CONFIG_HYBRIDTRANSFORM8X8 static const vp8_prob default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES] = { + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { { /* block Type 0 */ { @@ -964,7 +961,6 @@ default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] } } }; -#endif static const vp8_prob default_coef_probs_16x16[BLOCK_TYPES_16X16] @@ -1173,7 +1169,6 @@ static const vp8_prob } }; -#if CONFIG_HYBRIDTRANSFORM16X16 static const vp8_prob default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] @@ -1380,4 +1375,3 @@ static const vp8_prob } } }; -#endif diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c index 90f7a52c2..a3f731a3c 100644 --- a/vp8/common/entropy.c +++ b/vp8/common/entropy.c @@ -64,8 +64,6 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = { 7, 11, 14, 15, }; - -#if CONFIG_HYBRIDTRANSFORM DECLARE_ALIGNED(16, const int, vp8_col_scan[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, @@ -78,7 +76,6 @@ DECLARE_ALIGNED(16, const int, vp8_row_scan[16]) = { 8, 9, 10, 11, 12, 13, 14, 15 }; -#endif DECLARE_ALIGNED(64, const int, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5, @@ -208,25 +205,19 @@ vp8_extra_bit_struct vp8_extra_bits[12] = { void vp8_default_coef_probs(VP8_COMMON *pc) { vpx_memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(pc->fc.coef_probs)); -#if CONFIG_HYBRIDTRANSFORM vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs, sizeof(pc->fc.hybrid_coef_probs)); -#endif vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8, sizeof(pc->fc.coef_probs_8x8)); -#if CONFIG_HYBRIDTRANSFORM8X8 vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8, sizeof(pc->fc.hybrid_coef_probs_8x8)); -#endif vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16, sizeof(pc->fc.coef_probs_16x16)); -#if CONFIG_HYBRIDTRANSFORM16X16 vpx_memcpy(pc->fc.hybrid_coef_probs_16x16, default_hybrid_coef_probs_16x16, sizeof(pc->fc.hybrid_coef_probs_16x16)); -#endif } void vp8_coef_tree_initialize() { @@ -344,7 +335,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { } } -#if CONFIG_HYBRIDTRANSFORM for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < COEF_BANDS; ++j) for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -366,7 +356,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { else cm->fc.hybrid_coef_probs[i][j][k][t] = prob; } } -#endif for (i = 0; i < BLOCK_TYPES_8X8; ++i) for (j = 0; j < COEF_BANDS; ++j) @@ -390,7 +379,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { } } -#if CONFIG_HYBRIDTRANSFORM8X8 for (i = 0; i < BLOCK_TYPES_8X8; ++i) for (j = 0; j < COEF_BANDS; ++j) for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -413,7 +401,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob; } } -#endif for (i = 0; i < BLOCK_TYPES_16X16; ++i) for (j = 0; j < COEF_BANDS; ++j) @@ -437,7 +424,6 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { } } -#if CONFIG_HYBRIDTRANSFORM16X16 for (i = 0; i < BLOCK_TYPES_16X16; ++i) for (j = 0; j < COEF_BANDS; ++j) for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { @@ -458,5 +444,4 @@ void vp8_adapt_coef_probs(VP8_COMMON *cm) { else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob; } } -#endif } diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h index b9dfb344f..48a100ac6 100644 --- a/vp8/common/entropy.h +++ b/vp8/common/entropy.h @@ -104,10 +104,8 @@ struct VP8Common; void vp8_default_coef_probs(struct VP8Common *); extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]); -#if CONFIG_HYBRIDTRANSFORM extern DECLARE_ALIGNED(16, const int, vp8_col_scan[16]); extern DECLARE_ALIGNED(16, const int, vp8_row_scan[16]); -#endif extern short vp8_default_zig_zag_mask[16]; extern DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]); diff --git a/vp8/common/entropymode.c b/vp8/common/entropymode.c index 5627aa43a..bcd9f3707 100644 --- a/vp8/common/entropymode.c +++ b/vp8/common/entropymode.c @@ -215,9 +215,9 @@ const vp8_tree_index vp8_uv_mode_tree[VP8_UV_MODES * 2 - 2] = { }; const vp8_tree_index vp8_mbsplit_tree[6] = { - -3, 2, - -2, 4, - -0, -1 + -PARTITIONING_4X4, 2, + -PARTITIONING_8X8, 4, + -PARTITIONING_16X8, -PARTITIONING_8X16, }; const vp8_tree_index vp8_mv_ref_tree[8] = { @@ -301,11 +301,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) { vpx_memcpy(x->fc.sub_mv_ref_prob, vp8_sub_mv_ref_prob2, sizeof(vp8_sub_mv_ref_prob2)); vpx_memcpy(x->fc.mbsplit_prob, vp8_mbsplit_probs, sizeof(vp8_mbsplit_probs)); -#if CONFIG_SWITCHABLE_INTERP vpx_memcpy(x->fc.switchable_interp_prob, vp8_switchable_interp_prob, sizeof(vp8_switchable_interp_prob)); -#endif - } @@ -338,7 +335,6 @@ void vp8_kf_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES] [VP8_BINTRAMODES] [ } while (++i < VP8_BINTRAMODES); } -#if CONFIG_SWITCHABLE_INTERP #if VP8_SWITCHABLE_FILTERS == 3 const vp8_tree_index vp8_switchable_interp_tree[VP8_SWITCHABLE_FILTERS*2-2] = { -0, 2, @@ -363,19 +359,10 @@ const vp8_prob vp8_switchable_interp_prob [VP8_SWITCHABLE_FILTERS+1] { 64}, {192}, }; -//#define SWITCHABLE_86 -#ifdef SWITCHABLE_86 -const INTERPOLATIONFILTERTYPE vp8_switchable_interp[VP8_SWITCHABLE_FILTERS] = { - EIGHTTAP, SIXTAP}; -const int vp8_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, -1, -1}; //8, 6 -#else const INTERPOLATIONFILTERTYPE vp8_switchable_interp[VP8_SWITCHABLE_FILTERS] = { EIGHTTAP, EIGHTTAP_SHARP}; const int vp8_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s #endif -#endif -#endif - void vp8_entropy_mode_init() { vp8_tokens_from_tree(vp8_bmode_encodings, vp8_bmode_tree); @@ -387,10 +374,8 @@ void vp8_entropy_mode_init() { vp8_tokens_from_tree(vp8_uv_mode_encodings, vp8_uv_mode_tree); vp8_tokens_from_tree(vp8_i8x8_mode_encodings, vp8_i8x8_mode_tree); vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree); -#if CONFIG_SWITCHABLE_INTERP vp8_tokens_from_tree(vp8_switchable_interp_encodings, vp8_switchable_interp_tree); -#endif vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array, vp8_mv_ref_tree, NEARESTMV); diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h index 430c949a6..debb5659e 100644 --- a/vp8/common/entropymode.h +++ b/vp8/common/entropymode.h @@ -76,16 +76,14 @@ void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES void vp8_adapt_mode_probs(struct VP8Common *); -#if CONFIG_SWITCHABLE_INTERP #define VP8_SWITCHABLE_FILTERS 2 /* number of switchable filters */ extern const INTERPOLATIONFILTERTYPE vp8_switchable_interp [VP8_SWITCHABLE_FILTERS]; -extern const int vp8_switchable_interp_map[SWITCHABLE+1]; +extern const int vp8_switchable_interp_map[SWITCHABLE + 1]; extern const vp8_tree_index vp8_switchable_interp_tree - [2*(VP8_SWITCHABLE_FILTERS-1)]; + [2*(VP8_SWITCHABLE_FILTERS - 1)]; extern struct vp8_token_struct vp8_switchable_interp_encodings [VP8_SWITCHABLE_FILTERS]; extern const vp8_prob vp8_switchable_interp_prob - [VP8_SWITCHABLE_FILTERS+1][VP8_SWITCHABLE_FILTERS-1]; -#endif + [VP8_SWITCHABLE_FILTERS + 1][VP8_SWITCHABLE_FILTERS - 1]; #endif diff --git a/vp8/common/entropymv.c b/vp8/common/entropymv.c index 6c31236ec..a442a2438 100644 --- a/vp8/common/entropymv.c +++ b/vp8/common/entropymv.c @@ -14,8 +14,6 @@ //#define MV_COUNT_TESTING -#if CONFIG_NEWMVENTROPY - #define MV_COUNT_SAT 16 #define MV_MAX_UPDATE_FACTOR 160 @@ -450,413 +448,13 @@ void vp8_adapt_nmv_probs(VP8_COMMON *cm, int usehp) { } } -#else /* CONFIG_NEWMVENTROPY */ - -#define MV_COUNT_SAT 16 -#define MV_MAX_UPDATE_FACTOR 128 - -const MV_CONTEXT_HP vp8_mv_update_probs_hp[2] = { - {{ - 237, - 246, - 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 250, 250, 252, 254, 254, 254 - } - }, - {{ - 231, - 243, - 245, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 251, 251, 254, 254, 254, 254 - } - } -}; -const MV_CONTEXT_HP vp8_default_mv_context_hp[2] = { - {{ - /* row */ - 162, /* is short */ - 128, /* sign */ - 220, 204, 180, 192, 192, 119, 192, 192, 180, 140, 192, 192, 224, 224, 224, /* short tree */ - 128, 129, 132, 75, 145, 178, 206, 239, 254, 254, 254 /* long bits */ - } - }, - {{ - /* same for column */ - 164, /* is short */ - 128, - 220, 204, 180, 192, 192, 119, 192, 192, 180, 140, 192, 192, 224, 224, 224, /* short tree */ - 128, 130, 130, 74, 148, 180, 203, 236, 254, 254, 254 /* long bits */ - } - } -}; - -const MV_CONTEXT vp8_mv_update_probs[2] = { - {{ - 237, - 246, - 253, 253, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 - } - }, - {{ - 231, - 243, - 245, 253, 254, 254, 254, 254, 254, - 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 - } - } -}; -const MV_CONTEXT vp8_default_mv_context[2] = { - {{ - /* row */ - 162, /* is short */ - 128, /* sign */ - 225, 146, 172, 147, 214, 39, 156, /* short tree */ - 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */ - } - }, - {{ - /* same for column */ - 164, /* is short */ - 128, - 204, 170, 119, 235, 140, 230, 228, - 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */ - } - } -}; - -const vp8_tree_index vp8_small_mvtree_hp [30] = { - 2, 16, - 4, 10, - 6, 8, - -0, -1, - -2, -3, - 12, 14, - -4, -5, - -6, -7, - 18, 24, - 20, 22, - -8, -9, - -10, -11, - 26, 28, - -12, -13, - -14, -15 -}; -struct vp8_token_struct vp8_small_mvencodings_hp [16]; - -const vp8_tree_index vp8_small_mvtree [14] = { - 2, 8, - 4, 6, - -0, -1, - -2, -3, - 10, 12, - -4, -5, - -6, -7 -}; -struct vp8_token_struct vp8_small_mvencodings [8]; - -__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2], int pbits) { - const unsigned int tot = ct[0] + ct[1]; - if (tot) { - const vp8_prob x = ((ct[0] * 255) / tot) & -(1 << (8 - pbits)); - *p = x ? x : 1; - } else { - *p = 128; - } -} - -static void compute_component_probs( - const unsigned int events [MVvals], - vp8_prob Pnew [MVPcount], - unsigned int is_short_ct[2], - unsigned int sign_ct[2], - unsigned int bit_ct [mvlong_width] [2], - unsigned int short_ct [mvnum_short], - unsigned int short_bct [mvnum_short - 1] [2] -) { - is_short_ct[0] = is_short_ct[1] = 0; - sign_ct[0] = sign_ct[1] = 0; - vpx_memset(bit_ct, 0, sizeof(unsigned int)*mvlong_width * 2); - vpx_memset(short_ct, 0, sizeof(unsigned int)*mvnum_short); - vpx_memset(short_bct, 0, sizeof(unsigned int) * (mvnum_short - 1) * 2); - - { - const int c = events [mv_max]; - is_short_ct [0] += c; // Short vector - short_ct [0] += c; // Magnitude distribution - } - { - int j = 1; - do { - const int c1 = events [mv_max + j]; // positive - const int c2 = events [mv_max - j]; // negative - const int c = c1 + c2; - int a = j; - - sign_ct [0] += c1; - sign_ct [1] += c2; - - if (a < mvnum_short) { - is_short_ct [0] += c; // Short vector - short_ct [a] += c; // Magnitude distribution - } else { - int k = mvlong_width - 1; - is_short_ct [1] += c; // Long vector - - do - bit_ct [k] [(a >> k) & 1] += c; - - while (--k >= 0); - } - } while (++j <= mv_max); - } - calc_prob(Pnew + mvpis_short, is_short_ct, 8); - - calc_prob(Pnew + MVPsign, sign_ct, 8); - - { - vp8_prob p [mvnum_short - 1]; /* actually only need branch ct */ - int j = 0; - - vp8_tree_probs_from_distribution( - mvnum_short, vp8_small_mvencodings, vp8_small_mvtree, - p, short_bct, short_ct, - 256, 1 - ); - - do - calc_prob(Pnew + MVPshort + j, short_bct[j], 8); - while (++j < mvnum_short - 1); - } - - { - int j = 0; - do - calc_prob(Pnew + MVPbits + j, bit_ct[j], 8); - while (++j < mvlong_width); - } -} - -static void compute_component_probs_hp( - const unsigned int events [MVvals_hp], - vp8_prob Pnew [MVPcount_hp], - unsigned int is_short_ct[2], - unsigned int sign_ct[2], - unsigned int bit_ct [mvlong_width_hp] [2], - unsigned int short_ct [mvnum_short_hp], - unsigned int short_bct [mvnum_short_hp - 1] [2] -) { - is_short_ct[0] = is_short_ct[1] = 0; - sign_ct[0] = sign_ct[1] = 0; - vpx_memset(bit_ct, 0, sizeof(unsigned int)*mvlong_width_hp * 2); - vpx_memset(short_ct, 0, sizeof(unsigned int)*mvnum_short_hp); - vpx_memset(short_bct, 0, sizeof(unsigned int) * (mvnum_short_hp - 1) * 2); - - { - const int c = events [mv_max_hp]; - is_short_ct [0] += c; // Short vector - short_ct [0] += c; // Magnitude distribution - } - { - int j = 1; - do { - const int c1 = events [mv_max_hp + j]; // positive - const int c2 = events [mv_max_hp - j]; // negative - const int c = c1 + c2; - int a = j; - - sign_ct [0] += c1; - sign_ct [1] += c2; - - if (a < mvnum_short_hp) { - is_short_ct [0] += c; // Short vector - short_ct [a] += c; // Magnitude distribution - } else { - int k = mvlong_width_hp - 1; - is_short_ct [1] += c; // Long vector - - do - bit_ct [k] [(a >> k) & 1] += c; - - while (--k >= 0); - } - } while (++j <= mv_max_hp); - } - calc_prob(Pnew + mvpis_short_hp, is_short_ct, 8); - - calc_prob(Pnew + MVPsign_hp, sign_ct, 8); - - { - vp8_prob p [mvnum_short_hp - 1]; /* actually only need branch ct */ - int j = 0; - - vp8_tree_probs_from_distribution( - mvnum_short_hp, vp8_small_mvencodings_hp, vp8_small_mvtree_hp, - p, short_bct, short_ct, - 256, 1 - ); - - do - calc_prob(Pnew + MVPshort_hp + j, short_bct[j], 8); - while (++j < mvnum_short_hp - 1); - } - - { - int j = 0; - do - calc_prob(Pnew + MVPbits_hp + j, bit_ct[j], 8); - while (++j < mvlong_width_hp); - } -} - -void vp8_adapt_mv_probs(VP8_COMMON *cm) { - int i, t, count, factor; -#ifdef MV_COUNT_TESTING - printf("static const unsigned int\nMVcount[2][MVvals]={\n"); - for (i = 0; i < 2; ++i) { - printf(" { "); - for (t = 0; t < MVvals; t++) { - printf("%d, ", cm->fc.MVcount[i][t]); - if (t % 16 == 15 && t != MVvals - 1) printf("\n "); - } - printf("},\n"); - } - printf("};\n"); - printf("static const unsigned int\nMVcount_hp[2][MVvals_hp]={\n"); - for (i = 0; i < 2; ++i) { - printf(" { "); - for (t = 0; t < MVvals_hp; t++) { - printf("%d, ", cm->fc.MVcount_hp[i][t]); - if (t % 16 == 15 && t != MVvals_hp - 1) printf("\n "); - } - printf("},\n"); - } - printf("};\n"); -#endif /* MV_COUNT_TESTING */ - - for (i = 0; i < 2; ++i) { - int prob; - unsigned int is_short_ct[2]; - unsigned int sign_ct[2]; - unsigned int bit_ct [mvlong_width] [2]; - unsigned int short_ct [mvnum_short]; - unsigned int short_bct [mvnum_short - 1] [2]; - vp8_prob Pnew [MVPcount]; - compute_component_probs(cm->fc.MVcount[i], Pnew, - is_short_ct, sign_ct, - bit_ct, short_ct, short_bct); - count = is_short_ct[0] + is_short_ct[1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc[i].prob[mvpis_short] * (256 - factor) + - (int)Pnew[mvpis_short] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc[i].prob[mvpis_short] = 1; - else if (prob > 255) cm->fc.mvc[i].prob[mvpis_short] = 255; - else cm->fc.mvc[i].prob[mvpis_short] = prob; - - count = sign_ct[0] + sign_ct[1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc[i].prob[MVPsign] * (256 - factor) + - (int)Pnew[MVPsign] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc[i].prob[MVPsign] = 1; - else if (prob > 255) cm->fc.mvc[i].prob[MVPsign] = 255; - else cm->fc.mvc[i].prob[MVPsign] = prob; - - for (t = 0; t < mvnum_short - 1; ++t) { - count = short_bct[t][0] + short_bct[t][1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc[i].prob[MVPshort + t] * (256 - factor) + - (int)Pnew[MVPshort + t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc[i].prob[MVPshort + t] = 1; - else if (prob > 255) cm->fc.mvc[i].prob[MVPshort + t] = 255; - else cm->fc.mvc[i].prob[MVPshort + t] = prob; - } - for (t = 0; t < mvlong_width; ++t) { - count = bit_ct[t][0] + bit_ct[t][1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc[i].prob[MVPbits + t] * (256 - factor) + - (int)Pnew[MVPbits + t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc[i].prob[MVPbits + t] = 1; - else if (prob > 255) cm->fc.mvc[i].prob[MVPbits + t] = 255; - else cm->fc.mvc[i].prob[MVPbits + t] = prob; - } - } - for (i = 0; i < 2; ++i) { - int prob; - unsigned int is_short_ct[2]; - unsigned int sign_ct[2]; - unsigned int bit_ct [mvlong_width_hp] [2]; - unsigned int short_ct [mvnum_short_hp]; - unsigned int short_bct [mvnum_short_hp - 1] [2]; - vp8_prob Pnew [MVPcount_hp]; - compute_component_probs_hp(cm->fc.MVcount_hp[i], Pnew, - is_short_ct, sign_ct, - bit_ct, short_ct, short_bct); - count = is_short_ct[0] + is_short_ct[1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc_hp[i].prob[mvpis_short_hp] * (256 - factor) + - (int)Pnew[mvpis_short_hp] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc_hp[i].prob[mvpis_short_hp] = 1; - else if (prob > 255) cm->fc.mvc_hp[i].prob[mvpis_short_hp] = 255; - else cm->fc.mvc_hp[i].prob[mvpis_short_hp] = prob; - - count = sign_ct[0] + sign_ct[1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPsign_hp] * (256 - factor) + - (int)Pnew[MVPsign_hp] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPsign_hp] = 1; - else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPsign_hp] = 255; - else cm->fc.mvc_hp[i].prob[MVPsign_hp] = prob; - - for (t = 0; t < mvnum_short_hp - 1; ++t) { - count = short_bct[t][0] + short_bct[t][1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPshort_hp + t] * (256 - factor) + - (int)Pnew[MVPshort_hp + t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = 1; - else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = 255; - else cm->fc.mvc_hp[i].prob[MVPshort_hp + t] = prob; - } - for (t = 0; t < mvlong_width_hp; ++t) { - count = bit_ct[t][0] + bit_ct[t][1]; - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)cm->fc.pre_mvc_hp[i].prob[MVPbits_hp + t] * (256 - factor) + - (int)Pnew[MVPbits_hp + t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = 1; - else if (prob > 255) cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = 255; - else cm->fc.mvc_hp[i].prob[MVPbits_hp + t] = prob; - } - } -} - -#endif /* CONFIG_NEWMVENTROPY */ - void vp8_entropy_mv_init() { -#if CONFIG_NEWMVENTROPY vp8_tokens_from_tree(vp8_mv_joint_encodings, vp8_mv_joint_tree); vp8_tokens_from_tree(vp8_mv_class_encodings, vp8_mv_class_tree); vp8_tokens_from_tree(vp8_mv_class0_encodings, vp8_mv_class0_tree); vp8_tokens_from_tree(vp8_mv_fp_encodings, vp8_mv_fp_tree); -#else - vp8_tokens_from_tree(vp8_small_mvencodings, vp8_small_mvtree); - vp8_tokens_from_tree(vp8_small_mvencodings_hp, vp8_small_mvtree_hp); -#endif } void vp8_init_mv_probs(VP8_COMMON *cm) { -#if CONFIG_NEWMVENTROPY vpx_memcpy(&cm->fc.nmvc, &vp8_default_nmv_context, sizeof(nmv_context)); -#else - vpx_memcpy(cm->fc.mvc, - vp8_default_mv_context, sizeof(vp8_default_mv_context)); - vpx_memcpy(cm->fc.mvc_hp, - vp8_default_mv_context_hp, sizeof(vp8_default_mv_context_hp)); -#endif } diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h index 1a193b172..80540a54c 100644 --- a/vp8/common/entropymv.h +++ b/vp8/common/entropymv.h @@ -22,7 +22,6 @@ void vp8_entropy_mv_init(); void vp8_init_mv_probs(struct VP8Common *cm); void vp8_adapt_mv_probs(struct VP8Common *cm); -#if CONFIG_NEWMVENTROPY void vp8_adapt_nmv_probs(struct VP8Common *cm, int usehp); void vp8_lower_mv_precision(MV *mv); int vp8_use_nmv_hp(const MV *ref); @@ -129,65 +128,4 @@ void vp8_counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]); -#else /* CONFIG_NEWMVENTROPY */ - -enum { - mv_max = 1023, /* max absolute value of a MV component */ - MVvals = (2 * mv_max) + 1, /* # possible values "" */ - mvlong_width = 10, /* Large MVs have 9 bit magnitudes */ - mvnum_short = 8, /* magnitudes 0 through 7 */ - mvnum_short_bits = 3, /* number of bits for short mvs */ - - mvfp_max = 255, /* max absolute value of a full pixel MV component */ - MVfpvals = (2 * mvfp_max) + 1, /* # possible full pixel MV values */ - - /* probability offsets for coding each MV component */ - - mvpis_short = 0, /* short (<= 7) vs long (>= 8) */ - MVPsign, /* sign for non-zero */ - MVPshort, /* 8 short values = 7-position tree */ - - MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */ - MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */ -}; - -typedef struct mv_context { - vp8_prob prob[MVPcount]; /* often come in row, col pairs */ -} MV_CONTEXT; - -extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; - -enum { - mv_max_hp = 2047, /* max absolute value of a MV component */ - MVvals_hp = (2 * mv_max_hp) + 1, /* # possible values "" */ - mvlong_width_hp = 11, /* Large MVs have 9 bit magnitudes */ - mvnum_short_hp = 16, /* magnitudes 0 through 15 */ - mvnum_short_bits_hp = 4, /* number of bits for short mvs */ - - mvfp_max_hp = 255, /* max absolute value of a full pixel MV component */ - MVfpvals_hp = (2 * mvfp_max_hp) + 1, /* # possible full pixel MV values */ - - /* probability offsets for coding each MV component */ - - mvpis_short_hp = 0, /* short (<= 7) vs long (>= 8) */ - MVPsign_hp, /* sign for non-zero */ - MVPshort_hp, /* 8 short values = 7-position tree */ - - MVPbits_hp = MVPshort_hp + mvnum_short_hp - 1, /* mvlong_width long value bits */ - MVPcount_hp = MVPbits_hp + mvlong_width_hp /* (with independent probabilities) */ -}; - -typedef struct mv_context_hp { - vp8_prob prob[MVPcount_hp]; /* often come in row, col pairs */ -} MV_CONTEXT_HP; - -extern const MV_CONTEXT_HP vp8_mv_update_probs_hp[2], vp8_default_mv_context_hp[2]; - -extern const vp8_tree_index vp8_small_mvtree[]; -extern struct vp8_token_struct vp8_small_mvencodings [8]; -extern const vp8_tree_index vp8_small_mvtree_hp[]; -extern struct vp8_token_struct vp8_small_mvencodings_hp [16]; - -#endif /* CONFIG_NEWMVENTROPY */ - #endif diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c index 7c9ea1066..5fc135090 100644 --- a/vp8/common/findnearmv.c +++ b/vp8/common/findnearmv.c @@ -10,7 +10,7 @@ #include "findnearmv.h" -#include "vp8/encoder/variance.h" +#include "vp8/common/sadmxn.h" #include <limits.h> const unsigned char vp8_mbsplit_offset[4][16] = { @@ -22,11 +22,7 @@ const unsigned char vp8_mbsplit_offset[4][16] = { static void lower_mv_precision(int_mv *mv, int usehp) { -#if CONFIG_NEWMVENTROPY if (!usehp || !vp8_use_nmv_hp(&mv->as_mv)) { -#else - if (!usehp) { -#endif if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); if (mv->as_mv.col & 1) @@ -199,6 +195,23 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc, } #if CONFIG_NEWBESTREFMV +unsigned int vp8_sad3x16_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + int max_sad) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16); +} +unsigned int vp8_sad16x3_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + int max_sad) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3); +} + /* check a list of motion vectors by sad score using a number rows of pixels * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector @@ -260,10 +273,10 @@ void vp8_find_best_ref_mvs(MACROBLOCKD *xd, sad = 0; if (xd->up_available) - sad += vp8_sad16x3_c(above_src, xd->dst.y_stride, + sad += vp8_sad16x3(above_src, xd->dst.y_stride, above_ref + offset, ref_y_stride, INT_MAX); if (xd->left_available) - sad += vp8_sad3x16_c(left_src, xd->dst.y_stride, + sad += vp8_sad3x16(left_src, xd->dst.y_stride, left_ref + offset, ref_y_stride, INT_MAX); // Add the entry to our list and then resort the list on score. sad_scores[i] = sad; diff --git a/vp8/common/idct.h b/vp8/common/idct.h index d096e8182..ae33df668 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -109,12 +109,9 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_lossless_c); extern prototype_second_order(vp8_short_inv_walsh4x4_1_lossless_c); #endif -#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 #include "vp8/common/blockd.h" void vp8_ihtllm_c(short *input, short *output, int pitch, TX_TYPE tx_type, int tx_dim); -#endif - typedef prototype_idct((*vp8_idct_fn_t)); typedef prototype_idct_scalar_add((*vp8_idct_scalar_add_fn_t)); diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index d705fec32..c7369b2e2 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -26,9 +26,7 @@ #include "vp8/common/idct.h" #include "vp8/common/systemdependent.h" -#if CONFIG_HYBRIDTRANSFORM #include "vp8/common/blockd.h" -#endif #include <math.h> @@ -38,7 +36,6 @@ static const int rounding = 0; // TODO: these transforms can be further converted into integer forms // for complexity optimization -#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 float idct_4[16] = { 0.500000000000000, 0.653281482438188, 0.500000000000000, 0.270598050073099, 0.500000000000000, 0.270598050073099, -0.500000000000000, -0.653281482438188, @@ -90,9 +87,7 @@ float iadst_8[64] = { 0.483002021635509, -0.466553967085785, 0.434217976756762, -0.387095214016348, 0.326790388032145, -0.255357107325375, 0.175227946595736, -0.089131608307532 }; -#endif -#if CONFIG_HYBRIDTRANSFORM16X16 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 float idct_16[256] = { 0.250000, 0.351851, 0.346760, 0.338330, 0.326641, 0.311806, 0.293969, 0.273300, 0.250000, 0.224292, 0.196424, 0.166664, 0.135299, 0.102631, 0.068975, 0.034654, @@ -162,9 +157,7 @@ float iadst_16[256] = { 0.347761, -0.344612, 0.338341, -0.329007, 0.316693, -0.301511, 0.283599, -0.263118, 0.240255, -0.215215, 0.188227, -0.159534, 0.129396, -0.098087, 0.065889, -0.033094 }; -#endif -#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 void vp8_ihtllm_c(short *input, short *output, int pitch, TX_TYPE tx_type, int tx_dim) { @@ -289,7 +282,6 @@ void vp8_ihtllm_c(short *input, short *output, int pitch, } vp8_clear_system_state(); // Make it simd safe : __asm emms; } -#endif void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) { int i; diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 3f97d2101..323d48de8 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -7,8 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ - - #include <stdlib.h> #include "vpx_config.h" #include "loopfilter.h" @@ -94,6 +92,7 @@ static __inline void vp8_filter(signed char mask, uc hev, uc *op1, *op1 = u ^ 0x80; } + void vp8_loop_filter_horizontal_edge_c ( unsigned char *s, @@ -218,6 +217,7 @@ static __inline void vp8_mbfilter(signed char mask, uc hev, uc flat, Filter2 = vp8_signed_char_clamp(vp8_filter + 3); Filter1 >>= 3; Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); *oq0 = u ^ 0x80; u = vp8_signed_char_clamp(ps0 + Filter2); @@ -271,8 +271,6 @@ void vp8_mbloop_filter_horizontal_edge_c } while (++i < count * 8); } - - void vp8_mbloop_filter_vertical_edge_c ( unsigned char *s, diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index 0396a7087..38df3500a 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -51,27 +51,14 @@ typedef struct frame_contexts { vp8_prob sub_mv_ref_prob [SUBMVREF_COUNT][VP8_SUBMVREFS - 1]; vp8_prob mbsplit_prob [VP8_NUMMBSPLITS - 1]; vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM vp8_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif vp8_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM8X8 vp8_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif vp8_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM16X16 vp8_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif -#if CONFIG_NEWMVENTROPY nmv_context nmvc; nmv_context pre_nmvc; -#else - MV_CONTEXT mvc[2]; - MV_CONTEXT_HP mvc_hp[2]; - MV_CONTEXT pre_mvc[2]; - MV_CONTEXT_HP pre_mvc_hp[2]; -#endif vp8_prob pre_bmode_prob [VP8_BINTRAMODES - 1]; vp8_prob pre_ymode_prob [VP8_YMODES - 1]; /* interframe intra mode probs */ vp8_prob pre_uv_mode_prob [VP8_YMODES][VP8_UV_MODES - 1]; @@ -87,56 +74,37 @@ typedef struct frame_contexts { vp8_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM vp8_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif vp8_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM8X8 vp8_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif vp8_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#if CONFIG_HYBRIDTRANSFORM16X16 vp8_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; -#endif unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#if CONFIG_HYBRIDTRANSFORM unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#endif unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#if CONFIG_HYBRIDTRANSFORM8X8 unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#endif unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#if CONFIG_HYBRIDTRANSFORM16X16 unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; -#endif -#if CONFIG_NEWMVENTROPY nmv_context_counts NMVcount; -#else - unsigned int MVcount [2] [MVvals]; - unsigned int MVcount_hp [2] [MVvals_hp]; -#endif -#if CONFIG_SWITCHABLE_INTERP - vp8_prob switchable_interp_prob[VP8_SWITCHABLE_FILTERS+1] - [VP8_SWITCHABLE_FILTERS-1]; -#endif + vp8_prob switchable_interp_prob[VP8_SWITCHABLE_FILTERS + 1] + [VP8_SWITCHABLE_FILTERS - 1]; int mode_context[6][4]; int mode_context_a[6][4]; @@ -161,10 +129,8 @@ typedef enum { ONLY_4X4 = 0, ALLOW_8X8 = 1, ALLOW_16X16 = 2, -#if CONFIG_TX_SELECT TX_MODE_SELECT = 3, -#endif - NB_TXFM_MODES = 3 + CONFIG_TX_SELECT, + NB_TXFM_MODES = 4, } TXFM_MODE; typedef struct VP8_COMMON_RTCD { @@ -302,10 +268,8 @@ typedef struct VP8Common { vp8_prob prob_comppred[COMP_PRED_CONTEXTS]; -#if CONFIG_TX_SELECT // FIXME contextualize vp8_prob prob_tx[TX_SIZE_MAX - 1]; -#endif vp8_prob mbskip_pred_probs[MBSKIP_CONTEXTS]; diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 17bbe3281..388612e8a 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -783,7 +783,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t if (mi->mbmi.mode == SPLITMV) { switch (mi->mbmi.partitioning) { - case 0 : { /* mv_top_bottom */ + case PARTITIONING_16X8 : { /* mv_top_bottom */ union b_mode_info *bmi = &mi->bmi[0]; MV *mv = &bmi->mv.as_mv; @@ -803,7 +803,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t break; } - case 1 : { /* mv_left_right */ + case PARTITIONING_8X16 : { /* mv_left_right */ union b_mode_info *bmi = &mi->bmi[0]; MV *mv = &bmi->mv.as_mv; @@ -823,7 +823,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t break; } - case 2 : { /* mv_quarters */ + case PARTITIONING_8X8 : { /* mv_quarters */ union b_mode_info *bmi = &mi->bmi[0]; MV *mv = &bmi->mv.as_mv; @@ -858,6 +858,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t vp8_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); break; } + case PARTITIONING_4X4: default : { union b_mode_info *bmi = mi->bmi; int bx0, by0; diff --git a/vp8/common/pred_common.c b/vp8/common/pred_common.c index a32389433..a97eed8e4 100644 --- a/vp8/common/pred_common.c +++ b/vp8/common/pred_common.c @@ -63,7 +63,6 @@ unsigned char get_pred_context(const VP8_COMMON *const cm, (m - cm->mode_info_stride)->mbmi.mb_skip_coeff; break; -#if CONFIG_SWITCHABLE_INTERP case PRED_SWITCHABLE_INTERP: { int left_in_image = (m - 1)->mbmi.mb_in_image; @@ -93,7 +92,6 @@ unsigned char get_pred_context(const VP8_COMMON *const cm, pred_context = VP8_SWITCHABLE_FILTERS; } break; -#endif default: // TODO *** add error trap code. @@ -175,11 +173,10 @@ const vp8_prob *get_pred_probs(const VP8_COMMON *const cm, pred_probability = &cm->mbskip_pred_probs[pred_context]; break; -#if CONFIG_SWITCHABLE_INTERP case PRED_SWITCHABLE_INTERP: pred_probability = &cm->fc.switchable_interp_prob[pred_context][0]; break; -#endif + default: // TODO *** add error trap code. pred_probability = NULL; diff --git a/vp8/common/pred_common.h b/vp8/common/pred_common.h index 402e0235f..2a9875dfe 100644 --- a/vp8/common/pred_common.h +++ b/vp8/common/pred_common.h @@ -22,12 +22,9 @@ typedef enum { PRED_REF = 1, PRED_COMP = 2, PRED_MBSKIP = 3, -#if CONFIG_SWITCHABLE_INTERP - PRED_SWITCHABLE_INTERP = 4, -#endif + PRED_SWITCHABLE_INTERP = 4 } PRED_ID; - extern unsigned char get_pred_context(const VP8_COMMON *const cm, const MACROBLOCKD *const xd, PRED_ID pred_id); diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index a41d233ab..6c60845fb 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -36,13 +36,7 @@ void vp8_setup_interp_filters(MACROBLOCKD *xd, &cm->rtcd.subpix, sixtap_avg8x8); xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE( &cm->rtcd.subpix, sixtap_avg16x16); - } - else if (mcomp_filter_type == EIGHTTAP -#if CONFIG_SWITCHABLE_INTERP - || - mcomp_filter_type == SWITCHABLE -#endif - ) { + } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) { xd->subpixel_predict = SUBPIX_INVOKE( &cm->rtcd.subpix, eighttap4x4); xd->subpixel_predict8x4 = SUBPIX_INVOKE( @@ -965,7 +959,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; BLOCKD *blockd = xd->block; - if (xd->mode_info_context->mbmi.partitioning < 3) { + if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { blockd[ 0].bmi = xd->mode_info_context->bmi[ 0]; blockd[ 2].bmi = xd->mode_info_context->bmi[ 2]; blockd[ 8].bmi = xd->mode_info_context->bmi[ 8]; diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index ef272df90..ea64c9682 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -125,22 +125,22 @@ specialize vp8_comp_intra_uv4x4_predict; # Loopfilter # prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_mbv; +specialize vp8_loop_filter_mbv sse2 prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bv; +specialize vp8_loop_filter_bv sse2 prototype void vp8_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bv8x8; +specialize vp8_loop_filter_bv8x8 sse2 prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_mbh; +specialize vp8_loop_filter_mbh sse2 prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bh; +specialize vp8_loop_filter_bh sse2 prototype void vp8_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bh8x8; +specialize vp8_loop_filter_bh8x8 sse2 prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit" specialize vp8_loop_filter_simple_mbv mmx sse2 media neon @@ -174,3 +174,210 @@ vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2 vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6 vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon +# +# sad 16x3, 3x16 +# +prototype unsigned int vp8_sad16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad" +specialize vp8_sad16x3 + +prototype unsigned int vp8_sad3x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad" +specialize vp8_sad3x16 + +# +# Encoder functions below this point. +# +if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then + + +# variance +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 + +prototype unsigned int vp8_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance32x32 + +prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance16x16 mmx sse2 +vp8_variance16x16_sse2=vp8_variance16x16_wmt +vp8_variance16x16_mmx=vp8_variance16x16_mmx + +prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance16x8 mmx sse2 +vp8_variance16x8_sse2=vp8_variance16x8_wmt +vp8_variance16x8_mmx=vp8_variance16x8_mmx + +prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance8x16 mmx sse2 +vp8_variance8x16_sse2=vp8_variance8x16_wmt +vp8_variance8x16_mmx=vp8_variance8x16_mmx + +prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance8x8 mmx sse2 +vp8_variance8x8_sse2=vp8_variance8x8_wmt +vp8_variance8x8_mmx=vp8_variance8x8_mmx + +prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance4x4 mmx sse2 +vp8_variance4x4_sse2=vp8_variance4x4_wmt +vp8_variance4x4_mmx=vp8_variance4x4_mmx + +prototype unsigned int vp8_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance32x32 + +prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance16x16 sse2 mmx ssse3 +vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt + +prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance8x16 sse2 mmx +vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt + +prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance16x8 sse2 mmx ssse3 +vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_ssse3; +vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt + +prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance8x8 sse2 mmx +vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt + +prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_variance4x4 sse2 mmx +vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt + +prototype unsigned int vp8_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad32x32 + +prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad16x16 mmx sse2 sse3 +vp8_sad16x16_sse2=vp8_sad16x16_wmt + +prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad16x8 mmx sse2 +vp8_sad16x8_sse2=vp8_sad16x8_wmt + +prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad8x16 mmx sse2 +vp8_sad8x16_sse2=vp8_sad8x16_wmt + +prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad8x8 mmx sse2 +vp8_sad8x8_sse2=vp8_sad8x8_wmt + +prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp8_sad4x4 mmx sse2 +vp8_sad4x4_sse2=vp8_sad4x4_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_h mmx sse2 +vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_v mmx sse2 +vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt + +prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar16x16_hv mmx sse2 +vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt + +prototype unsigned int vp8_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_h + +prototype unsigned int vp8_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_v + +prototype unsigned int vp8_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp8_variance_halfpixvar32x32_hv + +prototype void vp8_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad32x32x3 + +prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x16x3 sse3 ssse3 + +prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x8x3 sse3 ssse3 + +prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x16x3 sse3 + +prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x8x3 sse3 + +prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp8_sad4x4x3 sse3 + +prototype void vp8_sad32x32x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad32x32x8 + +prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad16x16x8 sse4 + +prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad16x8x8 sse4 + +prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad8x16x8 sse4 + +prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad8x8x8 sse4 + +prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" +specialize vp8_sad4x4x8 sse4 + +prototype void vp8_sad32x32x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad32x32x4d + +prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x16x4d sse3 + +prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad16x8x4d sse3 + +prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x16x4d sse3 + +prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad8x8x4d sse3 + +prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp8_sad4x4x4d sse3 + +# +# Block copy +# +case $arch in + x86*) + prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n" + specialize vp8_copy32xn sse2 sse3 + ;; +esac + +prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +specialize vp8_sub_pixel_mse16x16 sse2 mmx +vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt + +prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp8_mse16x16 mmx sse2 +vp8_mse16x16_sse2=vp8_mse16x16_wmt + +prototype unsigned int vp8_sub_pixel_mse32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" +specialize vp8_sub_pixel_mse32x32 + +prototype unsigned int vp8_get_mb_ss "const short *" +specialize vp8_get_mb_ss mmx sse2 + +# +# Structured Similarity (SSIM) +# +if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then + [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 + + prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp8_ssim_parms_8x8 $sse2_on_x86_64 + + prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp8_ssim_parms_16x16 $sse2_on_x86_64 +fi + +fi +# end encoder functions diff --git a/vp8/common/sadmxn.h b/vp8/common/sadmxn.h new file mode 100644 index 000000000..47b8dfc58 --- /dev/null +++ b/vp8/common/sadmxn.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef __INC_SAD_H +#define __INC_SAD_H + +static __inline +unsigned int sad_mx_n_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + int m, + int n) { + int r, c; + unsigned int sad = 0; + + for (r = 0; r < n; r++) { + for (c = 0; c < m; c++) { + sad += abs(src_ptr[c] - ref_ptr[c]); + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + return sad; +} + +#endif diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm index 697a5dee6..63b72385b 100644 --- a/vp8/common/x86/loopfilter_mmx.asm +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -594,790 +594,6 @@ sym(vp8_loop_filter_vertical_edge_mmx): ret -;void vp8_mbloop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_horizontal_edge_mmx) -sym(vp8_mbloop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbh: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 - - - ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm3=q1, mm7 = limit - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm5 = p0 - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, mm0 ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - - ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, mm4=hev - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rsi+rax], mm6 - movq [rsi], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdi] - movq mm6, [rsi+rax*2] ; p1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi], mm3 - movq [rsi+rax*2], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - - movq mm6, [rdi+rax*4] - neg rax - movq mm3, [rdi+rax ] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi+rax ], mm3 - neg rax - movq [rdi+rax*4], mm6 - -;EARLY_BREAK_OUT: - neg rax - add rsi,8 - dec rcx - jnz .next8_mbh - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_vertical_edge_mmx) -sym(vp8_mbloop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbv: - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ;transpose - movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 - - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - - movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 - - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - lea rdx, srct - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - - movq [rdx+56], mm7 - psubusb mm5, mm7 ; q2-q3 - - - movq [rdx+48], mm6 - psubusb mm7, mm6 ; q3-q2 - - por mm7, mm5; ; mm7=abs (q3-q2) - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - psubusb mm3, mm6 ; q1-q2 - - psubusb mm6, mm5 ; q2-q1 - por mm6, mm3 ; mm6=abs(q2-q1) - - movq [rdx+40], mm5 ; save q1 - movq [rdx+32], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq [rdx], mm0 ; save p3 - movq [rdx+8], mm1 ; save p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+24], mm3 ; save p0 - - movq [rdx+16], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 ; abs(q3-q2) > limit - - psubusb mm0, mm4 ; abs(p3-p2) > limit - psubusb mm1, mm4 ; abs(p2-p1) > limit - - psubusb mm6, mm4 ; abs(q2-q1) > limit - por mm7, mm6 ; or - - por mm0, mm1 ; - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+32] ; mm5=q0 - movq mm7, [rdx+40] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save abs(q1-q0) - psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 ; abs(q1 - q0) > thresh - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 ; abs(p1 - p0)> thresh - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - - ; start work on filters - lea rdx, srct - - ; start work on filters - movq mm2, [rdx+16] ; p1 - movq mm7, [rdx+40] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - movq mm6, [rdx+24] ; p0 - movq mm0, [rdx+32] ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rdx+24], mm6 - movq [rdx+32], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdx + 40] - movq mm6, [rdx + 16] ; p1 - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdx + 40], mm3 - movq [rdx + 16], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm6, [rdx+ 8] - movq mm3, [rdx+48] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 - pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - - ; tranpose and write back - movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 - movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 - - punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 - punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 - - movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 - movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 - - punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 - punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 - - movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 - punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 - - punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 - movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 - - punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 - punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 - - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 - - movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 - punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 - - movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 - punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 - - punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 - movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 - - punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 - punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 - - movq [rsi+rax*4], mm0 ; write out - movq [rdi+rax*4], mm6 ; write out - - movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 - punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 - - punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 - movq [rsi+rax*2], mm0 ; write out - - movq [rdi+rax*2], mm5 ; write out - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - - punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 - punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 - - movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 - punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 - - punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 - movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 - - movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 - punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 - - punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 - movq [rsi], mm0 ; write out - - movq [rdi], mm1 ; write out - neg rax - - punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 - punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 - - movq [rsi+rax*2], mm3 - movq [rdi+rax*2], mm4 - - lea rsi, [rsi+rax*8] - dec rcx - - jnz .next8_mbv - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - ;void vp8_loop_filter_simple_horizontal_edge_mmx ;( ; unsigned char *src_ptr, diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 295609c58..6f6531c86 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -380,302 +380,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ret -%macro MB_FILTER_AND_WRITEBACK 1 -%if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 - - mov rcx, rax - neg rcx -%elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx+32] ; p1 - movdqa xmm7, [rdx+80] ; q1 - movdqa xmm6, [rdx+48] ; p0 - movdqa xmm0, [rdx+64] ; q0 -%endif - - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - movdqa xmm3, xmm0 ; q0 - - psubsb xmm0, xmm6 ; q0 - p0 - - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - - paddsb xmm2, xmm0 ; 2 * (q0 - p0) - - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) - - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 ; vp8_filter - - pand xmm2, xmm4 ; Filter2 = vp8_filter & hev - pxor xmm0, xmm0 - - pandn xmm4, xmm1 ; vp8_filter&=~hev - pxor xmm1, xmm1 - - punpcklbw xmm0, xmm4 ; Filter 2 (hi) - movdqa xmm5, xmm2 - - punpckhbw xmm1, xmm4 ; Filter 2 (lo) - paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) - - pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9 - - pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9 - - punpckhbw xmm7, xmm5 ; axbxcxdx - paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - - punpcklbw xmm5, xmm5 ; exfxgxhx - psraw xmm7, 11 ; sign extended shift right by 3 - - psraw xmm5, 11 ; sign extended shift right by 3 - punpckhbw xmm4, xmm2 ; axbxcxdx - - punpcklbw xmm2, xmm2 ; exfxgxhx - psraw xmm4, 11 ; sign extended shift right by 3 - - packsswb xmm5, xmm7 ; Filter2 >>=3; - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm4 ; Filter1 >>=3; - movdqa xmm7, xmm1 - - paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 - movdqa xmm4, xmm1 - - psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 - movdqa xmm5, xmm0 - - movdqa xmm2, xmm5 - paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63 - - paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63 - paddw xmm5, xmm5 ; Filter 2 (hi) * 18 - - paddw xmm7, xmm7 ; Filter 2 (lo) * 18 - paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 - - paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 - paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 - - paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 - psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 - - psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 - psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 - - packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 - - psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 - packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - - psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 - - packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) - paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) - -%if %1 == 0 - movdqa xmm5, q2 ; q2 - movdqa xmm1, q1 ; q1 - movdqa xmm4, p1 ; p1 - movdqa xmm7, p2 ; p2 - -%elif %1 == 1 - movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2 - movdqa xmm1, XMMWORD PTR [rdi] ; q1 - movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1 - movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2 -%elif %1 == 2 - movdqa xmm5, XMMWORD PTR [rdx+96] ; q2 - movdqa xmm1, XMMWORD PTR [rdx+80] ; q1 - movdqa xmm4, XMMWORD PTR [rdx+32] ; p1 - movdqa xmm7, XMMWORD PTR [rdx+16] ; p2 -%endif - - pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80 - pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80 - - pxor xmm1, [GLOBAL(t80)] - pxor xmm4, [GLOBAL(t80)] - - psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) - paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) - - pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80; - pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80; - - pxor xmm7, [GLOBAL(t80)] - pxor xmm5, [GLOBAL(t80)] - - paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) - psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) - - pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80; - pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80; - -%if %1 == 0 - lea rsi, [rsi+rcx*2] - lea rdi, [rdi+rcx*2] - - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - - movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1 - movhps MMWORD PTR [rdi+rcx*2], xmm1 - - movq MMWORD PTR [rsi + rax], xmm4 ; p1 - movhps MMWORD PTR [rdi + rax], xmm4 - - movq MMWORD PTR [rsi+rax*2], xmm7 ; p2 - movhps MMWORD PTR [rdi+rax*2], xmm7 - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2 - movhps MMWORD PTR [rdi+rcx*2], xmm5 -%elif %1 == 1 - movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2 - movdqa XMMWORD PTR [rdi], xmm1 ; q1 - movdqa XMMWORD PTR [rsi], xmm3 ; q0 - movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0 - movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1 - movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2 -%elif %1 == 2 - movdqa XMMWORD PTR [rdx+80], xmm1 ; q1 - movdqa XMMWORD PTR [rdx+64], xmm3 ; q0 - movdqa XMMWORD PTR [rdx+48], xmm6 ; p0 - movdqa XMMWORD PTR [rdx+32], xmm4 ; p1 -%endif - -%endmacro - - -;void vp8_mbloop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_horizontal_edge_sse2) -sym(vp8_mbloop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and write back the results - MB_FILTER_AND_WRITEBACK 1 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) -sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the results - MB_FILTER_AND_WRITEBACK 0 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - %macro TRANSPOSE_16X8 2 movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 @@ -1141,233 +845,6 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): pop rbp ret -%macro MBV_TRANSPOSE 0 - movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - - punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - - punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 - - movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 - - punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 - movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - - punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 - punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 -%endmacro - -%macro MBV_WRITEBACK_1 0 - movq QWORD PTR [rsi], xmm0 - movhps MMWORD PTR [rdi], xmm0 - - movq QWORD PTR [rsi+2*rax], xmm6 - movhps MMWORD PTR [rdi+2*rax], xmm6 - - movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 - - punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 - - movq QWORD PTR [rsi+4*rax], xmm0 - movhps MMWORD PTR [rdi+4*rax], xmm0 - - movq QWORD PTR [rsi+2*rcx], xmm3 - movhps MMWORD PTR [rdi+2*rcx], xmm3 - - movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 - movdqa xmm0, xmm2 - - punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 - punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 - - movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 - - punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 -%endmacro - -%macro MBV_WRITEBACK_2 0 - movq QWORD PTR [rsi], xmm1 - movhps MMWORD PTR [rdi], xmm1 - - movq QWORD PTR [rsi+2*rax], xmm5 - movhps MMWORD PTR [rdi+2*rax], xmm5 - - movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 - punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 - - movq QWORD PTR [rsi+4*rax], xmm1 - movhps MMWORD PTR [rdi+4*rax], xmm1 - - movq QWORD PTR [rsi+2*rcx], xmm4 - movhps MMWORD PTR [rdi+2*rcx], xmm4 -%endmacro - - -;void vp8_mbloop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_vertical_edge_sse2) -sym(vp8_mbloop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 160 ; reserve 160 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ; Transpose - TRANSPOSE_16X8 1, 0 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 0 - - neg rax - ; start work on filters - MB_FILTER_AND_WRITEBACK 2 - - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - - ; transpose and write back - MBV_TRANSPOSE - - neg rax - - MBV_WRITEBACK_1 - - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - MBV_WRITEBACK_2 - - add rsp, 160 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) -sym(vp8_mbloop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 160 ; reserve 160 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[128]; - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - lea rdx, srct - - ; Transpose - TRANSPOSE_16X8 0, 0 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 0 - - ; start work on filters - MB_FILTER_AND_WRITEBACK 2 - - ; transpose and write back - MBV_TRANSPOSE - - mov rsi, arg(0) ;u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] - MBV_WRITEBACK_1 - mov rsi, arg(5) ;v_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] - MBV_WRITEBACK_2 - - add rsp, 160 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - ;void vp8_loop_filter_simple_horizontal_edge_sse2 ;( ; unsigned char *src_ptr, diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index e7239818e..716d10c79 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -9,63 +9,36 @@ */ +#include <emmintrin.h> // SSE2 #include "vpx_config.h" #include "vp8/common/loopfilter.h" -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2); prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2); -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2); extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; -extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; -extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; #if HAVE_MMX /* Horizontal MB filtering */ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - if (v_ptr) - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } @@ -99,26 +72,413 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned #endif -/* Horizontal MB filtering */ #if HAVE_SSE2 +void vp8_mbloop_filter_horizontal_edge_c_sse2 +( + unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, + int count +) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + __m128i mask, hev, flat; + __m128i thresh, limit, blimit; + const __m128i zero = _mm_set1_epi16(0); + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + + thresh = _mm_shuffle_epi32(_mm_cvtsi32_si128(_thresh[0] * 0x01010101), 0); + limit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_limit[0] * 0x01010101), 0); + blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128(_blimit[0] * 0x01010101), 0); + + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + do { + __m128i workp_a, workp_b, workp_shft; + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < count); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i vp8_filt; + __m128i work_a; + __m128i filter1, filter2; + + vp8_filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + vp8_filt = _mm_adds_epi8(vp8_filt, work_a); + vp8_filt = _mm_adds_epi8(vp8_filt, work_a); + vp8_filt = _mm_adds_epi8(vp8_filt, work_a); + /* (vp8_filter + 3 * (qs0 - ps0)) & mask */ + vp8_filt = _mm_and_si128(vp8_filt, mask); + + filter1 = _mm_adds_epi8(vp8_filt, t4); + filter2 = _mm_adds_epi8(vp8_filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + /* vp8_filt >> 1 */ + vp8_filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, vp8_filt); + vp8_filt = _mm_srli_epi16(vp8_filt, 1); + work_a = _mm_and_si128(work_a, t80); + vp8_filt = _mm_and_si128(vp8_filt, t7f); + vp8_filt = _mm_or_si128(vp8_filt, work_a); + + vp8_filt = _mm_andnot_si128(hev, vp8_filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, vp8_filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, vp8_filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + if (count == 1) { + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } else { + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } + } +} +static __inline void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 + x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 + x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 + x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 + x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 0*out_p), + _mm_cvtepi32_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1*out_p), + _mm_cvtepi32_pd(x6)); // 01 11 21 31 41 51 61 71 + _mm_storel_pd((double *)(out + 2*out_p), + _mm_cvtepi32_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3*out_p), + _mm_cvtepi32_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 4*out_p), + _mm_cvtepi32_pd(x6)); // 04 14 24 34 44 54 64 74 + _mm_storeh_pd((double *)(out + 5*out_p), + _mm_cvtepi32_pd(x6)); // 05 15 25 35 45 55 65 75 + _mm_storel_pd((double *)(out + 6*out_p), + _mm_cvtepi32_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7*out_p), + _mm_cvtepi32_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} +void vp8_mbloop_filter_vertical_edge_c_sse2 +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]); + unsigned char *src[4]; + unsigned char *dst[4]; + + src[0] = s - 5; + src[1] = s - 5 + 8; + src[2] = s - 5 + p*8; + src[3] = s - 5 + p*8 + 8; + + dst[0] = t_dst; + dst[1] = t_dst + 16*8; + dst[2] = t_dst + 8; + dst[3] = t_dst + 16*8 + 8; + + // 16x16->16x16 or 16x8->8x16 + transpose(src, p, dst, 16, (1 << count)); + + vp8_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit, + thresh, count); + + dst[0] = s - 5; + dst[1] = s - 5 + p*8; + + src[0] = t_dst; + src[1] = t_dst + 8; + + // 16x8->8x16 or 8x8->8x8 + transpose(src, 16, dst, p, (1 << (count - 1))); +} + +/* Horizontal MB filtering */ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + vp8_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 2); + + /* TODO: write sse2 version with u,v interleaved */ if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); + vp8_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); } +void vp8_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_c_sse2( + y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} /* Vertical MB Filtering */ -void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, struct loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + /* TODO: write sse2 version with u,v interleaved */ if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); + vp8_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); } +void vp8_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_c_sse2( + y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} /* Horizontal B Filtering */ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, |