summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--vpx_dsp/x86/convolve_avx2.h41
-rw-r--r--vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c578
2 files changed, 613 insertions, 6 deletions
diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h
index 343af9fd0..e9fc9c06a 100644
--- a/vpx_dsp/x86/convolve_avx2.h
+++ b/vpx_dsp/x86/convolve_avx2.h
@@ -100,6 +100,47 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
return sum1;
}
+static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
+}
+
+static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
+ const __m256i tmp =
+ _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
+ return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
+}
+
+static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
+ _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
+ __m128i *const dst_ptr_2,
+ const __m256i *const src) {
+ *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((uint32_t *)(dst_ptr_2)) =
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE __m256i mm256_round_epi16(const __m256i *const src,
+ const __m256i *const half_depth,
+ const int depth) {
+ const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
+ return _mm256_srai_epi16(nearest_src, depth);
+}
+
#undef MM256_BROADCASTSI128_SI256
#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 426b82592..0ccf89694 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -9,10 +9,12 @@
*/
#include <immintrin.h>
+#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"
// filters for 16_h8
@@ -326,6 +328,576 @@ static void vpx_filter_block1d16_v8_avg_avx2(
height, filter, 1);
}
+void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum. Calling add gives us
+ // first half of the output. Repeat again to get the second half of the
+ // output. Finally we shuffle again to combine the two outputs.
+ // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+ // time.
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i dst_first, dst_second;
+ __m256i tmp_0, tmp_1;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for first half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Do again to get the second half of dst
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Partial result for second half
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_second = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round each result
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+ dst_second = mm256_round_epi16(&dst_second, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_first = _mm256_packus_epi16(dst_first, dst_second);
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_first);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+ // Reorder into 2 1 1 2
+ src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+
+ dst_first = _mm256_packus_epi16(dst_first, dst_first);
+ dst_first = _mm256_permute4x64_epi64(dst_first, 0x8);
+
+ _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first));
+ }
+}
+
+void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel partial output. Then
+ // we can call add with another row to get the output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+ __m128i kernel_reg; // Kernel
+ __m256i kernel_reg_256, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi;
+ __m256i res_reg, res_reg_lo, res_reg_hi;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // We only need to go num_taps/2 - 1 row above the souce, so we move
+ // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+ src_ptr += src_stride_unrolled;
+
+ // Load Kernel
+ kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+ kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+ kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+ kernel_reg_23 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 =
+ _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+ src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+ src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23);
+
+ // Output from first half
+ res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23);
+ res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45);
+ res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo);
+
+ // Output from second half
+ res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23);
+ res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45);
+ res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi);
+
+ // Round the words
+ res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6);
+ res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);
+
+ // Save the result
+ mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001_lo = src_reg_1223_lo;
+ src_reg_m1001_hi = src_reg_1223_hi;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into two registers in the form
+ // ... k[3] k[2] k[3] k[2]
+ // ... k[5] k[4] k[5] k[4]
+ // Then we shuffle the source into
+ // ... s[1] s[0] s[0] s[-1]
+ // ... s[3] s[2] s[2] s[1]
+ // Calling multiply and add gives us half of the sum. Calling add gives us
+ // first half of the output. Repeat again to get the second half of the
+ // output. Finally we shuffle again to combine the two outputs.
+ // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+ // time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+ int h;
+
+ __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+ __m256i dst_reg;
+ __m256i tmp_0, tmp_1;
+ __m256i idx_shift_0 =
+ _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ __m256i idx_shift_2 =
+ _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+ 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ for (h = height; h >= 2; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+ src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+ src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+ // Get the output
+ tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+ tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+ dst_reg = _mm256_adds_epi16(tmp_0, tmp_1);
+
+ // Round the result
+ dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6);
+
+ // Finally combine to get the final dst
+ dst_reg = _mm256_packus_epi16(dst_reg, dst_reg);
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &dst_reg);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ // Repeat for the last row if needed
+ if (h > 0) {
+ __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+ __m128i dst_reg;
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ __m128i tmp_0, tmp_1;
+
+ __m128i src_reg_shift_0 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0));
+ __m128i src_reg_shift_2 =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2));
+
+ tmp_0 = _mm_maddubs_epi16(src_reg_shift_0,
+ _mm256_castsi256_si128(kernel_reg_23));
+ tmp_1 = _mm_maddubs_epi16(src_reg_shift_2,
+ _mm256_castsi256_si128(kernel_reg_45));
+ dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
+
+ dst_reg = round_epi16_sse2(&dst_reg, &reg_32, 6);
+
+ dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
+
+ _mm_storel_epi64((__m128i *)dst_ptr, dst_reg);
+ }
+}
+
+void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel partial output. Then
+ // we can call add with another row to get the output.
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg, kernel_reg_23,
+ kernel_reg_45; // Segments of the kernel used
+
+ // Result after multiply and add
+ __m256i res_reg_m1001, res_reg_1223;
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // We only need to go num_taps/2 - 1 row above the souce, so we move
+ // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+ src_ptr += src_stride_unrolled;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+ kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Output
+ res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23);
+ res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45);
+ res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223);
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
+void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+ // the middle four elements of the kernel into a single register in the form
+ // k[5:2] k[5:2] k[5:2] k[5:2]
+ // Then we shuffle the source into
+ // s[5:2] s[4:1] s[3:0] s[2:-1]
+ // Calling multiply and add gives us half of the sum next to each other.
+ // Calling horizontal add then gives us the output.
+ // Since avx2 has 256-bit register, we can do 2 rows at a time.
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+ int h;
+ const ptrdiff_t unrolled_src_stride = src_stride << 1;
+ const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+
+ __m256i src_reg, src_reg_shuf;
+ __m256i dst;
+ __m256i shuf_idx =
+ _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
+ 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+ // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+ src_ptr -= 1;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ for (h = height; h > 1; h -= 2) {
+ // Load the source
+ src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+ src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+
+ // Get the result
+ dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+ dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
+
+ // Round result
+ dst = mm256_round_epi16(&dst, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst = _mm256_packus_epi16(dst, _mm256_setzero_si256());
+
+ // Save
+ mm256_storeu2_epi32((__m128i *const)dst_ptr,
+ (__m128i *const)(dst_ptr + dst_stride), &dst);
+
+ src_ptr += unrolled_src_stride;
+ dst_ptr += unrolled_dst_stride;
+ }
+
+ if (h > 0) {
+ // Load the source
+ const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
+ __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
+ __m128i src_reg_shuf =
+ _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
+
+ // Get the result
+ __m128i dst =
+ _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg));
+ dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
+
+ // Round result
+ dst = round_epi16_sse2(&dst, &reg_32, 6);
+
+ // Pack to 8-bits
+ dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+ *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ }
+}
+
+void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride,
+ uint8_t *dst_ptr, ptrdiff_t dst_stride,
+ uint32_t height, const int16_t *kernel) {
+ // We will load two rows of pixels as 8-bit words, rearrange them into the
+ // form
+ // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0]
+ // so that we can call multiply and add with the kernel to get partial output.
+ // Calling horizontal add then gives us the completely output
+
+ // Register for source s[-1:3, :]
+ __m256i src_reg_1, src_reg_2, src_reg_3;
+ // Interleaved rows of the source. lo is first half, hi second
+ __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+ __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023;
+
+ __m128i kernel_reg_128; // Kernel
+ __m256i kernel_reg;
+
+ // Result after multiply and add
+ __m256i res_reg;
+
+ const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding
+
+ // We will compute the result two rows at a time
+ const ptrdiff_t src_stride_unrolled = src_stride << 1;
+ const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+ int h;
+
+ // We only need to go num_taps/2 - 1 row above the souce, so we move
+ // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+ src_ptr += src_stride_unrolled;
+
+ // Load Kernel
+ kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+ kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+ kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+ kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+ kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+ // Row -1 to row 0
+ src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+ (const __m128i *)(src_ptr + src_stride));
+
+ // Row 0 to row 1
+ src_reg_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+ src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+ // First three rows
+ src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+ for (h = height; h > 1; h -= 2) {
+ src_reg_2 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+ src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+ _mm256_castsi256_si128(src_reg_2), 1);
+
+ src_reg_3 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+ src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+ _mm256_castsi256_si128(src_reg_3), 1);
+
+ // Last three rows
+ src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+ // Combine all the rows
+ src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+
+ // Output
+ res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg);
+ res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256());
+
+ // Round the words
+ res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+ // Combine to get the result
+ res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+ // Save the result
+ mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+ &res_reg);
+
+ // Update the source by two rows
+ src_ptr += src_stride_unrolled;
+ dst_ptr += dst_stride_unrolled;
+
+ src_reg_m1001 = src_reg_1223;
+ src_reg_1 = src_reg_3;
+ }
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
@@ -377,12 +949,6 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
-#define vpx_filter_block1d16_v4_avx2 vpx_filter_block1d16_v8_avx2
-#define vpx_filter_block1d16_h4_avx2 vpx_filter_block1d16_h8_avx2
-#define vpx_filter_block1d8_v4_avx2 vpx_filter_block1d8_v8_avx2
-#define vpx_filter_block1d8_h4_avx2 vpx_filter_block1d8_h8_avx2
-#define vpx_filter_block1d4_v4_avx2 vpx_filter_block1d4_v8_avx2
-#define vpx_filter_block1d4_h4_avx2 vpx_filter_block1d4_h8_avx2
#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2
#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2
#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2