author    George Steed <george.steed@arm.com>  2023-03-21 14:31:50 +0000
committer George Steed <george.steed@arm.com>  2023-03-29 08:39:35 +0000
commit    4cf9819282aa123e8b126731ef5629ee5144cd86 (patch)
tree      531d2b6bedb504ef71c73b213242a918b652acf0 /vpx_dsp
parent    5b05f6f3a01d7e25d0573b482245a2b8b0eb09bd (diff)
Avoid LD2/ST2 instructions in vpx_dc_predictor_32x32_neon
The LD2 and ST2 instructions are useful when dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to use two of the normal load/store instructions instead. This patch replaces such occurrences in vpx_dc_predictor_32x32_neon and related functions.

With Clang-15 this speeds up the function by 10-30%, and with GCC-12 by 40-60%, depending on the micro-architecture being benchmarked on.

Change-Id: I670dc37908aa238f360104efd74d6c2108ecf945
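As a minimal sketch of why the two load patterns are interchangeable here (not part of the patch; the helper names sum_32_ld2/sum_32_ld1 and the use of vaddvq_u16 for the final horizontal add are illustrative assumptions), both variants below reduce the same 32 contiguous bytes, so the sum is identical; only the load pattern differs:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Old pattern: vld2q_u8 de-interleaves the 32 bytes, putting even-indexed
     * bytes in r.val[0] and odd-indexed bytes in r.val[1]. */
    static uint32_t sum_32_ld2(const uint8_t *ref) {
      const uint8x16x2_t r = vld2q_u8(ref);
      const uint16x8_t p =
          vaddq_u16(vpaddlq_u8(r.val[0]), vpaddlq_u8(r.val[1]));
      return vaddvq_u16(p); /* AArch64 horizontal add, used here for brevity */
    }

    /* New pattern: two plain contiguous loads; no de-interleaving work. */
    static uint32_t sum_32_ld1(const uint8_t *ref) {
      const uint8x16_t r0 = vld1q_u8(ref + 0);
      const uint8x16_t r1 = vld1q_u8(ref + 16);
      const uint16x8_t p = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
      return vaddvq_u16(p);
    }

The same reasoning applies on the store side: the old code set both halves of the uint8x16x2_t to the same broadcast DC value, so the byte-interleaving performed by ST2 produced the same 32 identical output bytes that two plain ST1 stores produce, only with extra permute work on most micro-architectures.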
Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/intrapred_neon.c  28
1 file changed, 15 insertions, 13 deletions
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 892310f15..b7f2a11ca 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -193,9 +193,10 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
// DC 32x32
static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
- const uint8x16x2_t r = vld2q_u8(ref);
- const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
- const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
+ const uint8x16_t r0 = vld1q_u8(ref + 0);
+ const uint8x16_t r1 = vld1q_u8(ref + 16);
+ const uint16x8_t p0 = vpaddlq_u8(r0);
+ const uint16x8_t p1 = vpaddlq_u8(r1);
const uint16x8_t p2 = vaddq_u16(p0, p1);
uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
sum = vpadd_u16(sum, sum);
@@ -204,23 +205,24 @@ static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc) {
- uint8x16x2_t dc_dup;
+ uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
int i;
- dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
-
for (i = 0; i < 32; ++i, dst += stride) {
- vst2q_u8(dst, dc_dup);
+ vst1q_u8(dst + 0, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
}
}
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x16x2_t a = vld2q_u8(above);
- const uint8x16x2_t l = vld2q_u8(left);
- const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
- const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
- const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
- const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
+ const uint8x16_t a0 = vld1q_u8(above + 0);
+ const uint8x16_t a1 = vld1q_u8(above + 16);
+ const uint8x16_t l0 = vld1q_u8(left + 0);
+ const uint8x16_t l1 = vld1q_u8(left + 16);
+ const uint16x8_t pa0 = vpaddlq_u8(a0);
+ const uint16x8_t pl0 = vpaddlq_u8(l0);
+ const uint16x8_t pa1 = vpaddlq_u8(a1);
+ const uint16x8_t pl1 = vpaddlq_u8(l1);
const uint16x8_t pa = vaddq_u16(pa0, pa1);
const uint16x8_t pl = vaddq_u16(pl0, pl1);
const uint16x8_t pal = vaddq_u16(pa, pl);