summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorLinfeng Zhang <linfengz@google.com>2017-10-11 11:59:04 -0700
committerLinfeng Zhang <linfengz@google.com>2017-10-16 15:42:42 -0700
commit580d32240f9a2f7783ced9b0e409de62ba07f4e1 (patch)
tree291af741b11ba297faeaba257751922f49422314 /vpx_dsp
parent1fa3ec3023ef1b841d993bc21cba32b658a790ea (diff)
downloadlibvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.gz
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.tar.bz2
libvpx-580d32240f9a2f7783ced9b0e409de62ba07f4e1.zip
Add 4 to 3 scaling SSSE3 optimization
Note this change will trigger the different C version on SSSE3 and generate different scaled output. Its speed is 2x compared with the version calling vpx_scaled_2d_ssse3(). Change-Id: I17fff122cd0a5ac8aa451d84daa606582da8e194
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/x86/convolve_ssse3.h61
-rw-r--r--vpx_dsp/x86/mem_sse2.h8
2 files changed, 69 insertions, 0 deletions
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h
index b71da0e4e..8da28f0b2 100644
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -11,6 +11,7 @@
#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#include <assert.h>
#include <tmmintrin.h> // SSSE3
#include "./vpx_config.h"
@@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
}
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+ // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
const __m128i *const f) {
// multiply 2 adjacent elements with the filter and add the result
@@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
return temp;
}
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ // compensate the subtracted 64 in f[1]. x4 is always non negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+ // compensate the subtracted 64 in f[2]. x5 is always non negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index f9f0a48a0..2ce738fb7 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
_mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
#endif // VPX_DSP_X86_MEM_SSE2_H_