summaryrefslogtreecommitdiff
path: root/vpx_dsp/x86/highbd_convolve_avx2.c
diff options
context:
space:
mode:
author: chiyotsai <chiyotsai@google.com> 2018-11-02 17:08:05 -0700
committer: Chi Yo Tsai <chiyotsai@google.com> 2019-01-15 20:02:19 +0000
commit: c182725cbc9e1e4892784a24c32b1bed80047b0c (patch)
tree: 4395b294c8b94afada7bee4393771ad82e26ebea /vpx_dsp/x86/highbd_convolve_avx2.c
parent: 19882cdbf9518f35eaeccff9702dbb5bab708e06 (diff)
download: libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.gz
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.tar.bz2
libvpx-c182725cbc9e1e4892784a24c32b1bed80047b0c.zip
Remove unnecessary calculation in 4-tap interpolation filter
Reduces the number of rows calculated for 2D 4-tap interpolation filter from h+7 rows to h+3 rows. Also fixes a bug in the avx2 function for 4-tap filters where the last row is computed incorrectly.

Performance:

|             | Baseline | Result   | Pct Gain |
| bitdepth lo | 4.00 fps | 4.02 fps | 0.5%     |
| bitdepth 10 | 1.90 fps | 1.91 fps | 0.5%     |

The performance is evaluated on speed 1 on jets.y4m br 500 over 100 frames. No BDBR loss is observed.

Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
Diffstat (limited to 'vpx_dsp/x86/highbd_convolve_avx2.c')
-rw-r--r-- vpx_dsp/x86/highbd_convolve_avx2.c | 34
1 files changed, 12 insertions, 22 deletions
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index aef067ea7..320962561 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,22 +1089,19 @@ static void vpx_highbd_filter_block1d8_h4_avx2(
// Repeat for the last row if needed
if (h > 0) {
- src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
- // Reorder into 2 1 1 2
- src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
&kernel_reg_23, &kernel_reg_45);
- res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+ res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
res_reg = _mm256_packus_epi32(res_reg, res_reg);
- res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+ res_reg = _mm256_min_epi16(res_reg, reg_max);
- _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
}
}
@@ -1279,10 +1276,6 @@ static void vpx_highbd_filter_block1d4_v4_avx2(
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@ static void vpx_highbd_filter_block1d8_v4_avx2(
const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
int h;
- // We only need to go num_taps/2 - 1 row above the souce, so we move
- // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
- src_ptr += src_stride_unrolled;
-
// Load Kernel
kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
vpx_highbd_filter_block1d4_h8_avg_avx2
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
vpx_highbd_filter_block1d4_v2_avg_sse2
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
- avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+ src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
#undef HIGHBD_FUNC