diff options
Diffstat (limited to 'vpx_dsp/x86/convolve_sse2.h')
-rw-r--r-- | vpx_dsp/x86/convolve_sse2.h | 43 |
1 files changed, 31 insertions, 12 deletions
diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h index 81fae2951..844354639 100644 --- a/vpx_dsp/x86/convolve_sse2.h +++ b/vpx_dsp/x86/convolve_sse2.h @@ -32,10 +32,10 @@ static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { // Interprets src as 8-bit words, zero extends to form 16-bit words, then // multiplies with ker and add the adjacent results to form 32-bit words. // Finally adds the result from 1 and 2 together. -static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, - const __m128i *const src_2, - const __m128i *const ker_1, - const __m128i *const ker_2) { +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); @@ -43,25 +43,44 @@ static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, return _mm_add_epi32(madd_1, madd_2); } -static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0, - const __m128i *const src_1, - const __m128i *const ker) { +// Interprets src as 16-bit words, then multiplies with ker and add the +// adjacent results to form 32-bit words. Finally adds the result from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); return _mm_packs_epi32(madd_1, madd_2); } // Interleaves src_1 and src_2 -static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1, - const __m128i *const src_2) { +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); return _mm_packs_epi32(tmp_1, tmp_2); } -static INLINE __m128i round_epi16_sse2(const __m128i *const src, - const __m128i *const half_depth, - const int depth) { +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); return _mm_srai_epi16(nearest_src, depth); } |