diff options
author | chiyotsai <chiyotsai@google.com> | 2018-11-02 17:08:05 -0700 |
---|---|---|
committer | Chi Yo Tsai <chiyotsai@google.com> | 2019-01-15 20:02:19 +0000 |
commit | c182725cbc9e1e4892784a24c32b1bed80047b0c (patch) | |
tree | 4395b294c8b94afada7bee4393771ad82e26ebea /vpx_dsp/x86/highbd_convolve_avx2.c | |
parent | 19882cdbf9518f35eaeccff9702dbb5bab708e06 (diff) | |
download | libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.gz libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.bz2 libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.zip |
Remove unnecessary calculation in 4-tap interpolation filter
Reduces the number of rows calculated for the 2D 4-tap interpolation filter
from h+7 rows to h+3 rows.
Also fixes a bug in the avx2 function for 4-tap filters where the last
row is computed incorrectly.
Performance:
| | Baseline | Result | Pct Gain |
| bitdepth lo | 4.00 fps | 4.02 fps | 0.5% |
| bitdepth 10 | 1.90 fps | 1.91 fps | 0.5% |
Performance was evaluated at speed 1 on jets.y4m at bitrate 500 over 100
frames.
No BDBR loss is observed.
Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
Diffstat (limited to 'vpx_dsp/x86/highbd_convolve_avx2.c')
-rw-r--r-- | vpx_dsp/x86/highbd_convolve_avx2.c | 34 |
1 files changed, 12 insertions, 22 deletions
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index aef067ea7..320962561 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -1089,22 +1089,19 @@ static void vpx_highbd_filter_block1d8_h4_avx2( // Repeat for the last row if needed if (h > 0) { - src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); - // Reorder into 2 1 1 2 - src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); - + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, &kernel_reg_23, &kernel_reg_45); - res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS); + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); res_reg = _mm256_packus_epi32(res_reg, res_reg); - res_reg = _mm256_permute4x64_epi64(res_reg, 0x8); + res_reg = _mm256_min_epi16(res_reg, reg_max); - _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); } } @@ -1279,10 +1276,6 @@ static void vpx_highbd_filter_block1d4_v4_avx2( const ptrdiff_t dst_stride_unrolled = dst_stride << 1; int h; - // We only need to go num_taps/2 - 1 row above the souce, so we move - // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down - src_ptr += src_stride_unrolled; - // Load Kernel kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); @@ -1368,10 +1361,6 @@ static void vpx_highbd_filter_block1d8_v4_avx2( const ptrdiff_t dst_stride_unrolled = dst_stride << 1; int h; - // We only need to go num_taps/2 - 1 row above the souce, so we move - // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down - src_ptr += src_stride_unrolled; - // Load Kernel kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); kernel_reg = 
_mm256_broadcastsi128_si256(kernel_reg_128); @@ -1476,9 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; #define vpx_highbd_filter_block1d4_h4_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_avx2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); -HIGH_FUN_CONV_2D(, avx2); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , avx2, 0); +HIGH_FUN_CONV_2D(, avx2, 0); // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; @@ -1497,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - avx2); -HIGH_FUN_CONV_2D(avg_, avx2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); +HIGH_FUN_CONV_2D(avg_, avx2, 1); #undef HIGHBD_FUNC |