summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
author: George Steed <george.steed@arm.com> 2023-03-22 11:49:33 +0000
committer: George Steed <george.steed@arm.com> 2023-03-29 08:39:35 +0000
commit: 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f (patch)
tree: d0e5fba87fa7e9274398021eb30bca24806c4827 /vpx_dsp
parent: 83def747ff316d283c949458a4b890b23e5e0b8b (diff)
download: libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.gz
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.bz2
libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.zip
Avoid LD2/ST2 instructions in highbd v predictors in Neon
The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4) are useful when dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to use the normal load/store instructions. This patch replaces such occurrences in the two larger block sizes: vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon.

Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/arm/highbd_intrapred_neon.c 24
1 files changed, 15 insertions, 9 deletions
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index b2aea14f7..ec97094be 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
- const uint16x8x2_t row = vld2q_u16(above);
+ const uint16x8_t row0 = vld1q_u16(above + 0);
+ const uint16x8_t row1 = vld1q_u16(above + 8);
int i;
(void)left;
(void)bd;
- for (i = 0; i < 16; i++, dst += stride) {
- vst2q_u16(dst, row);
+ for (i = 0; i < 16; i++) {
+ vst1q_u16(dst + 0, row0);
+ vst1q_u16(dst + 8, row1);
+ dst += stride;
}
}
void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
- const uint16x8x2_t row0 = vld2q_u16(above);
- const uint16x8x2_t row1 = vld2q_u16(above + 16);
+ const uint16x8_t row0 = vld1q_u16(above + 0);
+ const uint16x8_t row1 = vld1q_u16(above + 8);
+ const uint16x8_t row2 = vld1q_u16(above + 16);
+ const uint16x8_t row3 = vld1q_u16(above + 24);
int i;
(void)left;
(void)bd;
for (i = 0; i < 32; i++) {
- vst2q_u16(dst, row0);
- dst += 16;
- vst2q_u16(dst, row1);
- dst += stride - 16;
+ vst1q_u16(dst + 0, row0);
+ vst1q_u16(dst + 8, row1);
+ vst1q_u16(dst + 16, row2);
+ vst1q_u16(dst + 24, row3);
+ dst += stride;
}
}