Add 4 to 3 scaling SSSE3 optimization

Note that this change triggers a different C version on SSSE3 and generates different scaled output. It is 2x as fast as the version calling vpx_scaled_2d_ssse3(). Change-Id: I17fff122cd0a5ac8aa451d84daa606582da8e194
author: Linfeng Zhang <linfengz@google.com> 2017-10-11 11:59:04 -0700
committer: Linfeng Zhang <linfengz@google.com> 2017-10-16 15:42:42 -0700
commit: 580d32240f9a2f7783ced9b0e409de62ba07f4e1 (patch)
tree: 291af741b11ba297faeaba257751922f49422314 /vpx_dsp
parent: 1fa3ec3023ef1b841d993bc21cba32b658a790ea (diff)
download: libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.gz
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.bz2
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.zip
2 files changed, 69 insertions, 0 deletions
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h
index b71da0e4e..8da28f0b2 100644
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -11,6 +11,7 @@
 #ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
 #define VPX_DSP_X86_CONVOLVE_SSSE3_H_
 
+#include <assert.h>
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vpx_config.h"
@@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
   f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
 }
 
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
+  assert(filter[3] >= 0 && filter[3] < 256);
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
 static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
                                         const __m128i *const f) {
   // multiply 2 adjacent elements with the filter and add the result
@@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
   return temp;
 }
 
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+                                                    const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate the subtracted 64 in f[1]. x4 is always non negative.
+  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+  // add and saturate the results together
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x4);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+                                                   const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate the subtracted 64 in f[2]. x5 is always non negative.
+  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+  __m128i temp;
+
+  // add and saturate the results together
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x3);
+  temp = _mm_adds_epi16(temp, x4);
+  temp = _mm_adds_epi16(temp, x5);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
 #endif  // VPX_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index f9f0a48a0..2ce738fb7 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
   _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
 }
 
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
 #endif  // VPX_DSP_X86_MEM_SSE2_H_
author	Linfeng Zhang <linfengz@google.com>	2017-10-11 11:59:04 -0700
committer	Linfeng Zhang <linfengz@google.com>	2017-10-16 15:42:42 -0700
commit	580d32240f9a2f7783ced9b0e409de62ba07f4e1 (patch)
tree	291af741b11ba297faeaba257751922f49422314 /vpx_dsp
parent	1fa3ec3023ef1b841d993bc21cba32b658a790ea (diff)
download	libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.gz libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.bz2 libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.zip