diff options
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/x86/convolve_avx2.h | 44 | ||||
-rw-r--r-- | vpx_dsp/x86/convolve_ssse3.h | 23 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 22 |
3 files changed, 48 insertions, 41 deletions
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h index c2e83b53f..bc96b738f 100644 --- a/vpx_dsp/x86/convolve_avx2.h +++ b/vpx_dsp/x86/convolve_avx2.h @@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s, const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); - // add and saturate the results together - const __m256i min_x2x1 = _mm256_min_epi16(x2, x1); - const __m256i max_x2x1 = _mm256_max_epi16(x2, x1); - __m256i temp = _mm256_adds_epi16(x0, x3); - temp = _mm256_adds_epi16(temp, min_x2x1); - temp = _mm256_adds_epi16(temp, max_x2x1); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); // round and shift by 7 bit each 16 bit - temp = _mm256_adds_epi16(temp, k_64); - temp = _mm256_srai_epi16(temp, 7); - return temp; + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; } static INLINE __m128i convolve8_8_avx2(const __m256i *const s, @@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s, _mm256_castsi256_si128(f[2])); const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), _mm256_castsi256_si128(f[3])); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_adds_epi16(temp, k_64); - temp = _mm_srai_epi16(temp, 7); - return temp; + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; } #undef MM256_BROADCASTSI128_SI256 diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h index 8da28f0b2..e5d452f99 100644 --- a/vpx_dsp/x86/convolve_ssse3.h +++ b/vpx_dsp/x86/convolve_ssse3.h @@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_adds_epi16(temp, k_64); - temp = _mm_srai_epi16(temp, 7); - return temp; + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; } static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 5a94c69b5..5b16022d4 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); |