From 14ff1cb74af795a772e247345cf8664c6dfdba6b Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Tue, 24 Feb 2015 12:21:04 -0800 Subject: Fix high bit-depth loop-filter sse2 compiling issue - part 2 Change-Id: I6728b69bb3dff1daa64ff7142f691e80a089f1c4 --- vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c | 57 ++++++++++++++++++------ 1 file changed, 44 insertions(+), 13 deletions(-) (limited to 'vp9') diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c index 7e63f389e..62c3780eb 100644 --- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c @@ -45,14 +45,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - const __m128i blimit = _mm_slli_epi16( - _mm_unpacklo_epi8( - _mm_load_si128((const __m128i *)_blimit), zero), bd - 8); - const __m128i limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8); - const __m128i thresh = _mm_slli_epi16( - _mm_unpacklo_epi8( - _mm_load_si128((const __m128i *)_thresh), zero), bd - 8); + __m128i blimit, limit, thresh; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; __m128i ps1, qs1, ps0, qs0; @@ -68,6 +61,26 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, __m128i t4, t3, t80, t1; __m128i eight, four; + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + } + q4 = _mm_load_si128((__m128i *)(s + 4 * p)); p4 = _mm_load_si128((__m128i *)(s - 5 * p)); q3 = _mm_load_si128((__m128i *)(s + 3 * p)); @@ -121,7 +134,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, // highbd_filter4 t4 = _mm_set1_epi16(4); t3 = _mm_set1_epi16(3); - t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8); + if (bd == 8) + t80 = _mm_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm_set1_epi16(0x200); + else // bd == 12 + t80 = _mm_set1_epi16(0x800); + t1 = _mm_set1_epi16(0x1); ps1 = _mm_subs_epi16(p1, t80); @@ -136,7 +155,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, filt = _mm_adds_epi16(filt, work_a); filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); @@ -153,13 +171,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80); ps1 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); + // end highbd_filter4 // loopfilter done @@ -175,7 +193,14 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, flat = _mm_max_epi16(work, flat); work = _mm_max_epi16(abs_p1p0, abs_q1q0); flat = _mm_max_epi16(work, flat); - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + flat = _mm_cmpeq_epi16(flat, zero); // end flat_mask4 @@ -215,7 +240,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s, _mm_subs_epu16(q0, q7))); flat2 = _mm_max_epi16(work, flat2); - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8)); + if (bd == 8) + flat2 = _mm_subs_epu16(flat2, one); + else if (bd == 10) + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + flat2 = _mm_cmpeq_epi16(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask // end highbd_flat_mask5 -- cgit v1.2.3