path: root/vpx_dsp
author     Kyle Siefring <kylesiefring@gmail.com>  2017-10-22 19:34:19 -0400
committer  Kyle Siefring <kylesiefring@gmail.com>  2017-10-24 10:39:48 -0400
commit     ae35425ae64a3d9573f85a4a92c5638a58044057 (patch)
tree       d92525876e018c38873e281dd5045f72af1f11be /vpx_dsp
parent     b3a36f7946f930caa0e96448648db60d7330c98d (diff)
Optimize convolve8 SSSE3 and AVX2 intrinsics
Changed the intrinsics to perform summation similar to the way the
assembly does.

The new code diverges from the assembly by preferring unsaturated
additions.

Results for Haswell:

SSSE3
  Horiz/Vert  Size  Speedup
  Horiz       x4    ~32%
  Horiz       x8    ~6%
  Vert        x8    ~4%

AVX2
  Horiz/Vert  Size  Speedup
  Horiz       x16   ~16%
  Vert        x16   ~14%

BUG=webm:1471

Change-Id: I7ad98ea688c904b1ba324adf8eb977873c8b8668
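For readers who want the arithmetic without the intrinsics, the new summation
order can be modelled in scalar C. This is a minimal sketch, not code from the
change; the helper name is made up, and it assumes 8-tap Q7 filters (taps
summing to 128), as the vpx sub-pel filters are.

#include <stdint.h>

/* x0..x3 model the four pairwise partial sums produced by maddubs
 * (taps 0-1, 2-3, 4-5, 6-7). In the intrinsics, x0+x2 and x1+x3 are
 * plain 16-bit adds; that pairing is what keeps the partial sums in
 * int16_t range for the vpx filters, so only the last add saturates. */
static int16_t convolve8_round_sketch(int16_t x0, int16_t x1, int16_t x2,
                                      int16_t x3) {
  int32_t sum1 = x0 + x2;       /* unsaturated add (add_epi16)          */
  int32_t sum2 = x1 + x3;       /* unsaturated add (add_epi16)          */
  sum1 += 64;                   /* rounding offset added early          */
  int32_t total = sum1 + sum2;  /* the one saturating add (adds_epi16)  */
  if (total > INT16_MAX) total = INT16_MAX;
  if (total < INT16_MIN) total = INT16_MIN;
  return (int16_t)(total >> 7); /* srai by 7: divide by 128 (Q7 taps)   */
}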
Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/x86/convolve_avx2.h                  44
-rw-r--r--  vpx_dsp/x86/convolve_ssse3.h                 23
-rw-r--r--  vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c   22
3 files changed, 48 insertions, 41 deletions
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h
index c2e83b53f..bc96b738f 100644
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
- // add and saturate the results together
- const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);
- const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);
- __m256i temp = _mm256_adds_epi16(x0, x3);
- temp = _mm256_adds_epi16(temp, min_x2x1);
- temp = _mm256_adds_epi16(temp, max_x2x1);
+ __m256i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+ // adding x0 with x2 and x1 with x3 is the only order that prevents
+ // outranges for all filters
+ sum1 = _mm256_add_epi16(x0, x2);
+ sum2 = _mm256_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm256_add_epi16(sum1, k_64);
+ sum1 = _mm256_adds_epi16(sum1, sum2);
// round and shift by 7 bit each 16 bit
- temp = _mm256_adds_epi16(temp, k_64);
- temp = _mm256_srai_epi16(temp, 7);
- return temp;
+ sum1 = _mm256_srai_epi16(sum1, 7);
+ return sum1;
}
static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
_mm256_castsi256_si128(f[2]));
const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
_mm256_castsi256_si128(f[3]));
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_adds_epi16(temp, k_64);
- temp = _mm_srai_epi16(temp, 7);
- return temp;
+ __m128i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+ // adding x0 with x2 and x1 with x3 is the only order that prevents
+ // outranges for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+ // shift by 7 bit each 16 bit
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
}
#undef MM256_BROADCASTSI128_SI256
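A quick worked example of the rounding step the comments above refer to (my
own illustration, not part of the change): the filter taps are Q7, so adding
64 before the arithmetic shift by 7 implements divide-by-128 with
round-to-nearest.

/* (sum + 64) >> 7 == round(sum / 128) for non-negative sums.
 * e.g. sum = 1000: (1000 + 64) >> 7 = 8, and 1000 / 128 = 7.8125 -> 8. */
static int16_t round_shift7(int32_t sum) { return (int16_t)((sum + 64) >> 7); }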
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h
index 8da28f0b2..e5d452f99 100644
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_adds_epi16(temp, k_64);
- temp = _mm_srai_epi16(temp, 7);
- return temp;
+ __m128i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+ // adding x0 with x2 and x1 with x3 is the only order that prevents
+ // outranges for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+ // shift by 7 bit each 16 bit
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
}
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
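For readers not fluent in the intrinsic names (my own gloss, not from the
change): _mm_add_epi16 wraps on overflow, while _mm_adds_epi16 clamps each
16-bit lane, which is why the new code only relies on the final add to
saturate. A scalar model of the saturating lane operation:

#include <stdint.h>

/* Saturating 16-bit add: the per-lane behaviour of adds_epi16. */
static int16_t add16_sat(int16_t a, int16_t b) {
  int32_t s = (int32_t)a + b;
  if (s > INT16_MAX) s = INT16_MAX;
  if (s < INT16_MIN) s = INT16_MIN;
  return (int16_t)s;
}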
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 5a94c69b5..5b16022d4 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ __m128i srcRegFilt1, srcRegFilt2;
+ __m128i addFilterReg64, filtersReg, srcReg;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+ // sum the results together, saturating only on the final step
+ // the specific order of the additions prevents outranges
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+ // extract the higher half of the register
+ srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+ // add the rounding offset early to avoid another saturated add
+ srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
// shift by 7 bit each 16 bits
srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
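The diff ends just before the pack-and-store. As a self-contained reference
for what one output pixel of the 4-wide horizontal path computes, here is a
scalar sketch (my own; src is assumed to point at the pixel under the first
tap, and the real function does the final 8-bit clamp with packus):

#include <stdint.h>

/* Scalar reference for one output pixel of an 8-tap, Q7 horizontal filter:
 * multiply-accumulate, add the rounding offset 64, shift by 7, clamp to
 * 8 bits. Illustrative only; the intrinsic version works in 16-bit lanes
 * with a saturating final add. */
static uint8_t filter8_pixel(const uint8_t *src, const int16_t *filter) {
  int32_t sum = 64; /* rounding offset */
  for (int k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}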