diff options
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_16_neon.asm | 25 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 42 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c | 55 | ||||
-rw-r--r-- | vp9/common/vp9_alloccommon.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_blockd.h | 14 | ||||
-rw-r--r-- | vp9/common/vp9_entropy.c | 85 | ||||
-rw-r--r-- | vp9/common/vp9_entropy.h | 8 | ||||
-rw-r--r-- | vp9/common/vp9_entropymode.c | 12 | ||||
-rw-r--r-- | vp9/common/vp9_entropymode.h | 11 | ||||
-rw-r--r-- | vp9/common/vp9_enums.h | 20 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 54 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter_filters.c | 81 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 29 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 182 |
14 files changed, 352 insertions, 268 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm index e559272cd..751bc74bc 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm @@ -112,27 +112,27 @@ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m1 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m2 = max(m3, m4) + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) vabd.u8 q9, q6, q7 ; abs(p0 - q0) - vmax.u8 q3, q3, q4 ; m3 = max(m5, m6) + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) vmov.u8 q10, #0x80 - vmax.u8 q15, q11, q12 ; m1 = max(m1, m2) + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m1 = max(m1, m3) + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 veor q7, q7, q10 ; qs0 - vcge.u8 q15, q1, q15 ; abs(m1) > limit + vcge.u8 q15, q1, q15 ; abs(m11) > limit vshr.u8 q2, q2, #1 ; a = a / 2 veor q6, q6, q10 ; ps0 @@ -142,7 +142,7 @@ veor q8, q8, q10 ; qs1 - vmov.u8 q4, #3 + vmov.u16 q4, #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 @@ -150,13 +150,15 @@ vcge.u8 q9, q0, q9 ; a > blimit vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hevmask + vorr q14, q13, q14 ; hev vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) vmul.i16 q11, q11, q4 vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; filter_mask + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) vaddw.s8 q11, q11, d3 @@ -180,15 +182,14 @@ ; outer tap adjustments vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 - veor q6, q11, q10 ; *op0 = u^0x80 + veor q7, q0, q10 ; *oq0 = u^0x80 vbic q1, q1, q14 ; filter &= ~hev vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) - - veor q7, q0, q10 ; *oq0 = u^0x80 + veor q6, q11, q10 ; *op0 = u^0x80 veor q5, q13, q10 ; *op1 = u^0x80 veor q8, q12, q10 ; *oq1 = u^0x80 diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 2f022dc1d..b97e7aa4a 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -10,17 +10,6 @@ #include "./vp9_rtcd.h" -void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1); - vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1); -} - void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, @@ -31,3 +20,34 @@ void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */, vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1); vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1); } + +void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh); + vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh); +} diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c index 36cfc83c4..0c0f155ae 100644 --- a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c @@ -306,4 +306,59 @@ void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s, } } } + +void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh); + vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh); +} #endif // #if HAVE_DSPR2 diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 6e12638e3..28671c38c 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -211,8 +211,6 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_initialize_common() { vp9_init_neighbors(); - vp9_coef_tree_initialize(); - vp9_entropy_mode_init(); } void vp9_update_frame_size(VP9_COMMON *cm) { diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 37da92bd3..993ee7935 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -198,7 +198,6 @@ struct buf_2d { }; struct macroblockd_plane { - int16_t *qcoeff; int16_t *dqcoeff; uint16_t *eobs; PLANE_TYPE plane_type; @@ -387,19 +386,6 @@ static INLINE void foreach_transformed_block_uv( foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } -static int raster_block_offset(BLOCK_SIZE plane_bsize, - int raster_block, int stride) { - const int bw = b_width_log2(plane_bsize); - const int y = 4 * (raster_block >> bw); - const int x = 4 * (raster_block & ((1 << bw) - 1)); - return y * stride + x; -} -static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, - int raster_block, int16_t *base) { - const int stride = 4 << b_width_log2(plane_bsize); - return base + raster_block_offset(plane_bsize, raster_block, stride); -} - static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, int *x, int *y) { diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 0f978cc95..b35c43fcd 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -113,49 +113,6 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; -// Array indices are identical to previously-existing CONTEXT_NODE indices -const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { - -DCT_EOB_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ - -ONE_TOKEN, 6, /* 2 = ONE */ - 8, 12, /* 3 = LOW_VAL */ - -TWO_TOKEN, 10, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 14, 16, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 18, 20, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; - -// Unconstrained Node Tree -const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { - 2, 6, /* 0 = LOW_VAL */ - -TWO_TOKEN, 4, /* 1 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */ - 8, 10, /* 3 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 4 = CAT_ONE */ - 12, 14, /* 5 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 6 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 7 = CAT_FIVE */ -}; - - - -struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -/* Trees for extra bits. Probabilities are constant and - do not depend on previously encoded bits */ - -static const vp9_prob Pcat1[] = { 159}; -static const vp9_prob Pcat2[] = { 165, 145}; -static const vp9_prob Pcat3[] = { 173, 148, 140}; -static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; -static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const vp9_prob Pcat6[] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - const vp9_tree_index vp9_coefmodel_tree[6] = { -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ @@ -446,43 +403,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); } -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; - -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; - - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; - } - - p[0] = p[1] = 0; -} - -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 14); -} - -const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = { - {0, 0, 0, 0}, // ZERO_TOKEN - {0, 0, 0, 1}, // ONE_TOKEN - {0, 0, 0, 2}, // TWO_TOKEN - {0, 0, 0, 3}, // THREE_TOKEN - {0, 0, 0, 4}, // FOUR_TOKEN - {cat1, Pcat1, 1, 5}, // DCT_VAL_CATEGORY1 - {cat2, Pcat2, 2, 7}, // DCT_VAL_CATEGORY2 - {cat3, Pcat3, 3, 11}, // DCT_VAL_CATEGORY3 - {cat4, Pcat4, 4, 19}, // DCT_VAL_CATEGORY4 - {cat5, Pcat5, 5, 35}, // DCT_VAL_CATEGORY5 - {cat6, Pcat6, 14, 67}, // DCT_VAL_CATEGORY6 - {0, 0, 0, 0} // DCT_EOB_TOKEN -}; - #include "vp9/common/vp9_default_coef_probs.h" void vp9_default_coef_probs(VP9_COMMON *cm) { @@ -492,11 +412,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); -} - #define COEF_COUNT_SAT 24 #define COEF_MAX_UPDATE_FACTOR 112 #define COEF_COUNT_SAT_KEY 24 diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 92a6c592a..941b251c3 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -44,15 +44,9 @@ extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)]; - -extern const vp9_tree_index vp9_coef_con_tree[]; - #define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ extern const vp9_tree_index vp9_coefmodel_tree[]; -extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - typedef struct { const vp9_tree_index *tree; const vp9_prob *prob; @@ -105,8 +99,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); - -void vp9_coef_tree_initialize(); void vp9_adapt_coef_probs(struct VP9Common *cm); static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 5d74c6967..265242129 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -232,21 +232,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { -D63_PRED, 16, /* 7 = D63_NODE */ -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; -struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) }; -struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, 187, 225 @@ -337,15 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init() { - vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); - vp9_tokens_from_tree(vp9_switchable_interp_encodings, - vp9_switchable_interp_tree); - vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree); -} #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h index 38b419948..df58bea3c 100644 --- a/vp9/common/vp9_entropymode.h +++ b/vp9/common/vp9_entropymode.h @@ -37,24 +37,13 @@ struct tx_counts { extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; - extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; -extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; - extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; -extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; - extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; -extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; - extern const vp9_tree_index vp9_switchable_interp_tree [TREE_SIZE(SWITCHABLE_FILTERS)]; -extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init(); void vp9_setup_past_independence(struct VP9Common *cm); diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 9e4117e17..34411a34f 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -52,20 +52,22 @@ typedef enum PARTITION_TYPE { #define PARTITION_PLOFFSET 4 // number of probability models per block size #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +// block transform size typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform + TX_4X4 = 0, // 4x4 transform + TX_8X8 = 1, // 8x8 transform + TX_16X16 = 2, // 16x16 transform + TX_32X32 = 3, // 32x32 transform TX_SIZES } TX_SIZE; +// frame transform mode typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block TX_MODES = 5, } TX_MODE; diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 0b48de2cb..ff2bc45e4 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -353,29 +353,17 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type, // TODO(yunqingwang): count in loopfilter functions should be removed. if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - if ((mask_16x16_0 & mask_16x16_1) & 1) { - // TODO(yunqingwang): Combine 2 calls as 1 wide filtering. - vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); - vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr); - } else if (mask_16x16_0 & 1) { - vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); - } else { - vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr); - } + if (mask_16x16_0 & 1) { + // if (mask_16x16_0 & 1) is 1, then (mask_16x16_1 & 1) is 1. + vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); } if ((mask_8x8_0 | mask_8x8_1) & 1) { if ((mask_8x8_0 & mask_8x8_1) & 1) { - // TODO(yunqingwang): Combine 2 calls as 1 wide filtering. - vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); - vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1); + vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); } else if (mask_8x8_0 & 1) { vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, 1); @@ -387,11 +375,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type, if ((mask_4x4_0 | mask_4x4_1) & 1) { if ((mask_4x4_0 & mask_4x4_1) & 1) { - // TODO(yunqingwang): Combine 2 calls as 1 wide filtering. - vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); - vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1); + vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_0 & 1) { vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, 1); @@ -403,11 +389,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type, if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { - // TODO(yunqingwang): Combine 2 calls as 1 wide filtering. - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, 1); - vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, 1); + vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); } else if (mask_4x4_int_0 & 1) { vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, 1); @@ -448,14 +432,10 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, count = 1; if (mask & 1) { if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - count = 2; - } else { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } + // If (mask_16x16 & 1) is 1, then (mask_16x16 & 3) is 3. + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + count = 2; } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { // Next block's thresholds diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c index 9edf8701f..ef8de2010 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -169,6 +169,34 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, } } +void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + int i, j; + const uint8_t *blimit = blimit0; + const uint8_t *limit = limit0; + const uint8_t *thresh = thresh0; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 8; ++j) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2, s - 1, s, s + 1); + s += pitch; + } + blimit = blimit1; + limit = limit1; + thresh = thresh1; + } +} + static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, @@ -264,6 +292,36 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, } } +void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + int i, j; + const uint8_t *blimit = blimit0; + const uint8_t *limit = limit0; + const uint8_t *thresh = thresh0; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 8; ++j) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); + s += pitch; + } + blimit = blimit1; + limit = limit1; + thresh = thresh1; + } +} + static INLINE void filter16(int8_t mask, uint8_t hev, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, @@ -366,3 +424,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, s += p; } } + +void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + int i; + + for (i = 0; i < 16; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7]); + + filter16(mask, hev, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + s += p; + } +} diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index e18e757c1..627ea31ed 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -193,12 +193,21 @@ specialize vp9_dc_128_predictor_32x32 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2 +prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" +specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2 + prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2 +prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" +specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2 + prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_loop_filter_vertical_edge mmx neon dspr2 +prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" +specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2 + prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2 @@ -206,13 +215,13 @@ prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const u specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2 prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" -specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon +specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_loop_filter_horizontal_edge mmx neon dspr2 prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" -specialize vp9_loop_filter_horizontal_edge_16 sse2 neon +specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2 # # post proc @@ -698,31 +707,31 @@ fi # fdct functions prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 +specialize vp9_short_fht4x4 sse2 avx2 prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 +specialize vp9_short_fht8x8 sse2 avx2 prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 +specialize vp9_short_fht16x16 sse2 avx2 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" specialize vp9_fwht4x4 prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct4x4 sse2 +specialize vp9_fdct4x4 sse2 avx2 prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct8x8 sse2 +specialize vp9_fdct8x8 sse2 avx2 prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct16x16 sse2 +specialize vp9_fdct16x16 sse2 avx2 prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32 sse2 +specialize vp9_fdct32x32 sse2 avx2 prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32_rd sse2 +specialize vp9_fdct32x32_rd sse2 avx2 # # Motion search diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 925f74d19..3ca55cfc3 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <emmintrin.h> /* SSE2 */ +#include <emmintrin.h> // SSE2 #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" @@ -99,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); @@ -110,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filter2 = _mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); - /* Filter1 >> 3 */ + // Filter1 >> 3 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), @@ -473,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); @@ -487,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); @@ -495,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -1014,23 +1014,23 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 filter1 = _mm_unpacklo_epi8(zero, filter1); filter1 = _mm_srai_epi16(filter1, 11); filter1 = _mm_packs_epi16(filter1, filter1); - /* Filter2 >> 3 */ + // Filter2 >> 3 filter2 = _mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 11); filter2 = _mm_packs_epi16(filter2, zero); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); filt = _mm_unpacklo_epi8(zero, filt); filt = _mm_srai_epi16(filt, 9); @@ -1083,7 +1083,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, } } -void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */, +void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, @@ -1255,27 +1255,27 @@ void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -1427,27 +1427,27 @@ void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -1474,7 +1474,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; - /* Read in 16 lines */ + // Read in 16 lines x0 = _mm_loadl_epi64((__m128i *)in0); x8 = _mm_loadl_epi64((__m128i *)in1); x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); @@ -1512,7 +1512,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store first 4-line result */ + // Store first 4-line result _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1528,7 +1528,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store second 4-line result */ + // Store second 4-line result _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1598,61 +1598,129 @@ static INLINE void transpose(unsigned char *src[], int in_p, } while (++idx8x8 < num_8x8_to_transpose); } -void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, - int p, +void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh, int count) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8); + unsigned char *src[1]; + unsigned char *dst[1]; + (void)count; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, p, dst, 8, 1); + + // Loop filtering + vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit, + thresh, 1); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); unsigned char *src[2]; unsigned char *dst[2]; - (void)count; - /* Transpose 16x16 */ - transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); - transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16); + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); - src[0] = t_dst + 3 * 16; - src[1] = t_dst + 3 * 16 + 8; + // Loop filtering + vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; - dst[0] = s - 5; - dst[1] = s - 5 + p * 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; - /* Transpose 16x8 */ + // Transpose back transpose(src, 16, dst, p, 2); } -void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, - int p, +void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); - unsigned char *src[4]; - unsigned char *dst[4]; - - dst[0] = t_dst; - dst[1] = t_dst + 8 * 16; + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16); + unsigned char *src[2]; + unsigned char *dst[2]; src[0] = s - 8; - src[1] = s - 8 + 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; - /* Transpose 16x16 */ - transpose(src, p, dst, 16, 2); + // Transpose 16x8 + transpose(src, p, dst, 8, 2); - /* Loop filtering */ - vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; - src[1] = t_dst + 8 * 16; - + src[1] = t_dst + 8 * 8; dst[0] = s - 8; - dst[1] = s - 8 + 8; + dst[1] = s; - transpose(src, 16, dst, p, 2); + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); } |