Avoid LD2/ST2 instructions in highbd v predictors in Neon

The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful when dealing with interleaved data (e.g. the real/imaginary components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to use the normal load/store instructions. In these predictors the row loaded from `above` is stored back unchanged, so the de-interleave on load and the re-interleave on store cancel out and plain loads/stores write the same values.

This patch replaces such occurrences in the two larger block sizes: vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon.

Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4
author: George Steed <george.steed@arm.com> 2023-03-22 11:49:33 +0000
committer: George Steed <george.steed@arm.com> 2023-03-29 08:39:35 +0000
commit: 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f (patch)
tree: d0e5fba87fa7e9274398021eb30bca24806c4827 /vpx_dsp
parent: 83def747ff316d283c949458a4b890b23e5e0b8b (diff)
download: libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.gz
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.bz2
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.zip
1 files changed, 15 insertions, 9 deletions
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index b2aea14f7..ec97094be 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row = vld2q_u16(above);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
   int i;
   (void)left;
   (void)bd;
 
-  for (i = 0; i < 16; i++, dst += stride) {
-    vst2q_u16(dst, row);
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
   }
 }
 
 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row0 = vld2q_u16(above);
-  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
   int i;
   (void)left;
   (void)bd;
 
   for (i = 0; i < 32; i++) {
-    vst2q_u16(dst, row0);
-    dst += 16;
-    vst2q_u16(dst, row1);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
   }
 }
author	George Steed <george.steed@arm.com>	2023-03-22 11:49:33 +0000
committer	George Steed <george.steed@arm.com>	2023-03-29 08:39:35 +0000
commit	9824167ad292ee42c9c97f3e6ce1d9ca90bf679f (patch)
tree	d0e5fba87fa7e9274398021eb30bca24806c4827 /vpx_dsp
parent	83def747ff316d283c949458a4b890b23e5e0b8b (diff)
download	libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.gz libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.bz2 libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.zip