diff options
author | George Steed <george.steed@arm.com> | 2023-03-21 14:31:50 +0000 |
---|---|---|
committer | George Steed <george.steed@arm.com> | 2023-03-29 08:39:35 +0000 |
commit | 4cf9819282aa123e8b126731ef5629ee5144cd86 (patch) | |
tree | 531d2b6bedb504ef71c73b213242a918b652acf0 /vpx_dsp | |
parent | 5b05f6f3a01d7e25d0573b482245a2b8b0eb09bd (diff) | |
download | libvpx-4cf9819282aa123e8b126731ef5629ee5144cd86.tar libvpx-4cf9819282aa123e8b126731ef5629ee5144cd86.tar.gz libvpx-4cf9819282aa123e8b126731ef5629ee5144cd86.tar.bz2 libvpx-4cf9819282aa123e8b126731ef5629ee5144cd86.zip |
Avoid LD2/ST2 instructions in vpx_dc_predictor_32x32_neon
The LD2 and ST2 instructions are useful if we are dealing with
interleaved data (e.g. real/imag components of complex numbers), but for
simply loading or storing larger quantities of data it is preferable to
use two of the ordinary load/store instructions instead.
This patch replaces such occurrences in vpx_dc_predictor_32x32_neon and
related functions.
With Clang-15 this speeds up this function by 10-30% depending on the
micro-architecture being benchmarked on. With GCC-12 this speeds up the
function by 40-60% depending on the micro-architecture being benchmarked
on.
Change-Id: I670dc37908aa238f360104efd74d6c2108ecf945
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/intrapred_neon.c | 28 |
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 892310f15..b7f2a11ca 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -193,9 +193,10 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, // DC 32x32 static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t p0 = vpaddlq_u8(r0); + const uint16x8_t p1 = vpaddlq_u8(r1); const uint16x8_t p2 = vaddq_u16(p0, p1); uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); sum = vpadd_u16(sum, sum); @@ -204,23 +205,24 @@ static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - uint8x16x2_t dc_dup; + uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc_dup); + vst1q_u8(dst + 16, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t pa0 = vpaddlq_u8(a0); + const uint16x8_t pl0 = vpaddlq_u8(l0); + const uint16x8_t pa1 = vpaddlq_u8(a1); + const uint16x8_t pl1 = vpaddlq_u8(l1); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = 
vaddq_u16(pl0, pl1); const uint16x8_t pal = vaddq_u16(pa, pl); |