diff options
author | George Steed <george.steed@arm.com> | 2023-03-22 11:49:33 +0000 |
---|---|---|
committer | George Steed <george.steed@arm.com> | 2023-03-29 08:39:35 +0000 |
commit | 9824167ad292ee42c9c97f3e6ce1d9ca90bf679f (patch) | |
tree | d0e5fba87fa7e9274398021eb30bca24806c4827 /vpx_dsp | |
parent | 83def747ff316d283c949458a4b890b23e5e0b8b (diff) | |
download | libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.gz libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.tar.bz2 libvpx-9824167ad292ee42c9c97f3e6ce1d9ca90bf679f.zip |
Avoid LD2/ST2 instructions in highbd v predictors in Neon
The interleaving load/store instructions (LD2/LD3/LD4 and ST2/ST3/ST4)
are useful when we are dealing with interleaved data (e.g. the real/imag
components of complex numbers), but when merely loading or storing larger
quantities of data it is preferable to use the normal load/store
instructions.
This patch replaces such occurrences in the two larger block sizes:
vpx_highbd_v_predictor_16x16_neon and vpx_highbd_v_predictor_32x32_neon.
Change-Id: Ie4ffa298a2466ceaf893566fd0aefe3f66f439e4
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/highbd_intrapred_neon.c | 24 |
1 files changed, 15 insertions, 9 deletions
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
index b2aea14f7..ec97094be 100644
--- a/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -2166,30 +2166,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row = vld2q_u16(above);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
   int i;
   (void)left;
   (void)bd;
 
-  for (i = 0; i < 16; i++, dst += stride) {
-    vst2q_u16(dst, row);
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
   }
 }
 
 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row0 = vld2q_u16(above);
-  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
   int i;
   (void)left;
   (void)bd;
 
   for (i = 0; i < 32; i++) {
-    vst2q_u16(dst, row0);
-    dst += 16;
-    vst2q_u16(dst, row1);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
   }
 }
 