diff options
-rw-r--r-- | test/convolve_test.cc | 52 | ||||
-rw-r--r-- | test/vpx_scale_test.h | 28 | ||||
-rw-r--r-- | vp8/common/mips/mmi/idct_blk_mmi.c | 71 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_frame_scale.c | 2 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_frame_scale_ssse3.c | 226 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/x86/convolve_avx2.h | 99 | ||||
-rw-r--r-- | vpx_dsp/x86/convolve_ssse3.h | 61 | ||||
-rw-r--r-- | vpx_dsp/x86/mem_sse2.h | 8 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 557 |
12 files changed, 678 insertions, 432 deletions
diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 9f6f795c9..08ef57224 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -925,33 +925,51 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. */ - ASM_REGISTER_STATE_CHECK( - UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac, - step, frac, step, Width(), Height())); - - CheckGuardBlocks(); - - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. */ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif using std::tr1::make_tuple; diff --git a/test/vpx_scale_test.h b/test/vpx_scale_test.h index 18909d1b5..dcbd02b91 100644 --- a/test/vpx_scale_test.h +++ b/test/vpx_scale_test.h @@ -15,11 +15,14 @@ #include "./vpx_config.h" #include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/yv12config.h" +using libvpx_test::ACMRandom; + namespace libvpx_test { class VpxScaleBase { @@ -65,12 +68,12 @@ class VpxScaleBase { ResetScaleImage(&img_, src_width, src_height); ResetScaleImage(&ref_img_, dst_width, dst_height); ResetScaleImage(&dst_img_, dst_width, dst_height); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); } void DeallocImages() { @@ -89,7 +92,8 @@ class VpxScaleBase { static const int kBufFiller = 123; static const int kBufMax = kBufFiller - 1; - static void FillPlane(uint8_t *buf, int width, int height, int stride) { + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { buf[x + (y * stride)] = (x + (width * y)) % kBufMax; @@ -97,6 +101,16 @@ class VpxScaleBase { } } + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, int width, int height, int stride, int padding) { // Copy the outermost visible pixel to a distance of at least 'padding.' diff --git a/vp8/common/mips/mmi/idct_blk_mmi.c b/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 000000000..f6020ab46 --- /dev/null +++ b/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, int8_t *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, + uint8_t *dstv, int stride, + int8_t *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4 * stride - 8; + } +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index be2ac0054..ece2785eb 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -28,10 +28,10 @@ add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; # # Loopfilter diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 5813c81c4..246fe6a67 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -122,6 +122,7 @@ VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index 832df18c8..a410d0407 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -28,7 +28,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; -#if HAVE_NEON +#if HAVE_SSSE3 || HAVE_NEON // TODO(linfengz): The 4:3 specialized C code is disabled by default since // it's much slower than the general version which calls vpx_scaled_2d() even // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 81e5b4229..7685e7bc3 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -438,6 +438,202 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, } while (x); } +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. + f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, const __m128i *const f) { __m128i ss[4], temp; @@ -652,6 +848,36 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, scaled = 0; } } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { // 1 to 2 uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 2d071b96a..fa5feca16 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -89,6 +89,7 @@ DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..c2e83b53f --- /dev/null +++ b/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include <immintrin.h> // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + // add and saturate the results together + const __m256i min_x2x1 = _mm256_min_epi16(x2, x1); + const __m256i max_x2x1 = _mm256_max_epi16(x2, x1); + __m256i temp = _mm256_adds_epi16(x0, x3); + temp = _mm256_adds_epi16(temp, min_x2x1); + temp = _mm256_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm256_adds_epi16(temp, k_64); + temp = _mm256_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#undef MM256_BROADCASTSI128_SI256 + +#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h index b71da0e4e..8da28f0b2 100644 --- a/vpx_dsp/x86/convolve_ssse3.h +++ b/vpx_dsp/x86/convolve_ssse3.h @@ -11,6 +11,7 @@ #ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_ #define VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#include <assert.h> #include <tmmintrin.h> // SSSE3 #include "./vpx_config.h" @@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter, f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); } +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result @@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, return temp; } +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. + const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + #endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index f9f0a48a0..2ce738fb7 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); } +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + #endif // VPX_DSP_X86_MEM_SSE2_H_ diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 4e851b58e..d0919695c 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -12,9 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 +// filters for 16_h8 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -35,160 +36,68 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static INLINE void vpx_filter_block1d16_h8_X_avx2( +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i filtersReg, outReg1, outReg2; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), 1); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), 1); - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(srcRegFilt32b1_1); - outReg2 = _mm256_extractf128_si256(srcRegFilt32b1_1, 1); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); outReg2 = _mm_avg_epu8( @@ -207,89 +116,40 @@ static INLINE void vpx_filter_block1d16_h8_X_avx2( // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + __m128i srcReg; - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + // load the first 16 bytes of the last row + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg1 = convolve8_8_avx2(s, f); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - outReg1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg2 = convolve8_8_avx2(s, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary if (avg) { @@ -304,169 +164,99 @@ static INLINE void vpx_filter_block1d16_h8_X_avx2( static void vpx_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { - vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride, + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, output_height, filter, 0); } static void vpx_filter_block1d16_h8_avg_avx2( const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { - vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride, + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, output_height, filter, 1); } -static INLINE void vpx_filter_block1d16_v8_X_avx2( +static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i filtersReg, outReg1, outReg2; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i outReg1, outReg2; + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + { + __m128i s[6]; + __m256i s32b[6]; + + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); + + // have each consecutive loads on the same 256 register + s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); + + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); + + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) + s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); + + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(srcReg32b1); - outReg2 = _mm256_extractf128_si256(srcReg32b1, 1); + outReg1 = _mm256_castsi256_si128(s1[0]); + outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); outReg2 = _mm_avg_epu8( @@ -481,78 +271,35 @@ static INLINE void vpx_filter_block1d16_v8_X_avx2( output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } + + // if the number of strides is odd. + // process only 16 bytes if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i srcRegHead2 = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - outReg1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + s1[0] = _mm256_castsi128_si256( + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + s2[0] = _mm256_castsi128_si256( + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + + outReg1 = convolve8_8_avx2(s1, f); + outReg2 = convolve8_8_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary if (avg) { @@ -568,14 +315,14 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { - vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, height, filter, 0); } static void vpx_filter_block1d16_v8_avg_avx2( const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { - vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, height, filter, 1); } |