Merge "Fix high bit-depth loop-filter sse2 compiling issue - part 4"

author: Jingning Han <jingning@google.com> 2015-02-27 09:49:10 -0800
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> 2015-02-27 09:49:10 -0800
commit: c4cb8059ff921e5318d018557ac557348e76a64f (patch)
tree: d1bcb2407a831457ebf6ad4fe5ad401b0af93ca2
parent: 43bb97f7d0b72eff2a00d34ecde98b9492638701 (diff)
parent: 5b87f1bb5a6996e0595b3cb78971a42ca91a97ca (diff)
download: libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar
libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar.gz
libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar.bz2
libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.zip
1 files changed, 48 insertions, 22 deletions
diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
index 04823ba9c..4bd8ac4a3 100644
--- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -775,15 +775,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_thresh,
                                       int count, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
-  const __m128i limit = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
-  const __m128i thresh = _mm_slli_epi16(
-      _mm_unpacklo_epi8(
-          _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+  __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
   __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
@@ -806,30 +798,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i work;
   const __m128i t4 = _mm_set1_epi16(4);
   const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
-  const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
-  const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
-  const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+  __m128i t80;
+  __m128i tff80;
+  __m128i tffe0;
+  __m128i t1f;
   // equivalent to shifting 0x1f left by bitdepth - 8
   // and setting new bits to 1
   const __m128i t1 = _mm_set1_epi16(0x1);
-  const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+  __m128i t7f;
   // equivalent to shifting 0x7f left by bitdepth - 8
   // and setting new bits to 1
-  const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
-                                     t80);
-  const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
-                                     t80);
-  const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
-                                     t80);
-  const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
-                                     t80);
+  __m128i ps1, ps0, qs0, qs1;
   __m128i filt;
   __m128i work_a;
   __m128i filter1, filter2;
 
   (void)count;
 
+  if (bd == 8) {
+    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    t80 = _mm_set1_epi16(0x80);
+    tff80 = _mm_set1_epi16(0xff80);
+    tffe0 = _mm_set1_epi16(0xffe0);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+  } else if (bd == 10) {
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+  } else {  // bd == 12
+    blimit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+    limit = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+    thresh = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+  }
+
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
   hev = _mm_subs_epu16(flat, thresh);
@@ -865,6 +890,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   filt = _mm_adds_epi16(filt, work_a);
   filt = _mm_adds_epi16(filt, work_a);
   filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
   // (vp9_filter + 3 * (qs0 - ps0)) & mask
   filt = _mm_and_si128(filt, mask);
author	Jingning Han <jingning@google.com>	2015-02-27 09:49:10 -0800
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	2015-02-27 09:49:10 -0800
commit	c4cb8059ff921e5318d018557ac557348e76a64f (patch)
tree	d1bcb2407a831457ebf6ad4fe5ad401b0af93ca2
parent	43bb97f7d0b72eff2a00d34ecde98b9492638701 (diff)
parent	5b87f1bb5a6996e0595b3cb78971a42ca91a97ca (diff)
download	libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar.gz libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.tar.bz2 libvpx-c4cb8059ff921e5318d018557ac557348e76a64f.zip