Remove unnecessary calculation in 4-tap interpolation filter

Reduces the number of rows calculated for 2D 4-tap interpolation filter from h+7 rows to h+3 rows.

Also fixes a bug in the avx2 function for 4-tap filters where the last row is computed incorrectly.

Performance:

|             | Baseline | Result   | Pct Gain |
|-------------|----------|----------|----------|
| bitdepth lo | 4.00 fps | 4.02 fps | 0.5%     |
| bitdepth 10 | 1.90 fps | 1.91 fps | 0.5%     |

The performance is evaluated on speed 1 on jets.y4m br 500 over 100 frames. No BDBR loss is observed.

Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
author: chiyotsai <chiyotsai@google.com> 2018-11-02 17:08:05 -0700
committer: Chi Yo Tsai <chiyotsai@google.com> 2019-01-15 20:02:19 +0000
commit: c182725cbc9e1e4892784a24c32b1bed80047b0c (patch)
tree: 4395b294c8b94afada7bee4393771ad82e26ebea /vpx_dsp/x86/highbd_convolve_avx2.c
parent: 19882cdbf9518f35eaeccff9702dbb5bab708e06 (diff)
download: libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.gz
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.bz2
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.zip
1 files changed, 12 insertions, 22 deletions
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index aef067ea7..320962561 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,22 +1089,19 @@ static void vpx_highbd_filter_block1d8_h4_avx2(
 
   // Repeat for the last row if needed
   if (h > 0) {
-    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
-    // Reorder into 2 1 1 2
-    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
     res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);
 
-    res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
 
     res_reg = _mm256_packus_epi32(res_reg, res_reg);
-    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
 
-    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
   }
 }
 
@@ -1279,10 +1276,6 @@ static void vpx_highbd_filter_block1d4_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@ static void vpx_highbd_filter_block1d8_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
 
 #undef HIGHBD_FUNC
author	chiyotsai <chiyotsai@google.com>	2018-11-02 17:08:05 -0700
committer	Chi Yo Tsai <chiyotsai@google.com>	2019-01-15 20:02:19 +0000
commit	c182725cbc9e1e4892784a24c32b1bed80047b0c (patch)
tree	4395b294c8b94afada7bee4393771ad82e26ebea /vpx_dsp/x86/highbd_convolve_avx2.c
parent	19882cdbf9518f35eaeccff9702dbb5bab708e06 (diff)
download	libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.gz libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.bz2 libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.zip