diff options
author | chiyotsai <chiyotsai@google.com> | 2018-10-18 09:51:56 -0700 |
---|---|---|
committer | chiyotsai <chiyotsai@google.com> | 2018-10-23 16:28:29 -0700 |
commit | 73930f97634bfd163882151a30cb52b95eb69bec (patch) | |
tree | 381f74b9cbd51d7c8e142677b57618e39b50f781 | |
parent | 137d99c91fc7f03b6b886e5ac342ee4ef7f875ec (diff) | |
download | libvpx-73930f97634bfd163882151a30cb52b95eb69bec.tar libvpx-73930f97634bfd163882151a30cb52b95eb69bec.tar.gz libvpx-73930f97634bfd163882151a30cb52b95eb69bec.tar.bz2 libvpx-73930f97634bfd163882151a30cb52b95eb69bec.zip |
Clean up vpx_dsp/x86/convolve_sse2.h
Removes unnecessary includes and rewords some functions/comments.
Change-Id: Ied557d7faa9d845d38255e6e3e0e3fe1395276e1
-rw-r--r-- | vpx_dsp/x86/convolve_sse2.h | 16 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 24 |
2 files changed, 21 insertions, 19 deletions
diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h index d674cc495..81fae2951 100644 --- a/vpx_dsp/x86/convolve_sse2.h +++ b/vpx_dsp/x86/convolve_sse2.h @@ -11,7 +11,6 @@ #ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ #define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ -#include <assert.h> #include <emmintrin.h> // SSE2 #include "./vpx_config.h" @@ -30,12 +29,13 @@ static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { return _mm_unpacklo_epi64(tmp, tmp); } -// Interprets src as 8-bit words, pads each word with zeroes to form 16-bit -// words, then multiplies with ker and add the adjacent results to form 32-bit -// words. Finally adds the result from 1 and 2 together. -static INLINE __m128i pad_multiply_add_add_epi8_sse2( - const __m128i *const src_1, const __m128i *const src_2, - const __m128i *const ker_1, const __m128i *const ker_2) { +// Interprets src as 8-bit words, zero extends to form 16-bit words, then +// multiplies with ker and add the adjacent results to form 32-bit words. +// Finally adds the result from 1 and 2 together. 
+static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); @@ -50,6 +50,8 @@ static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0, const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); return _mm_packs_epi32(madd_1, madd_2); } + +// Interleaves src_1 and src_2 static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1, const __m128i *const src_2) { const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 3518d82f0..fa223aed0 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -54,12 +54,12 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2, - &kernel_reg_23, &kernel_reg_45); + even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst dst_first = combine_epi32_sse2(&even, &odd); @@ -71,12 +71,12 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 14 12 10 8 - even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2, - &kernel_reg_23, &kernel_reg_45); + even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, 
&kernel_reg_23, + &kernel_reg_45); // Output 15 13 11 9 - odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the second half of the dst dst_second = combine_epi32_sse2(&even, &odd); @@ -288,12 +288,12 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2, - &kernel_reg_23, &kernel_reg_45); + even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst dst_first = combine_epi32_sse2(&even, &odd); |