diff options
author | James Zern <jzern@google.com> | 2022-09-01 18:47:50 -0700 |
---|---|---|
committer | James Zern <jzern@google.com> | 2022-09-01 18:47:50 -0700 |
commit | 281dfae8353940fe380c73384607ec11a5c53f43 (patch) | |
tree | 97c71966a24e14aa53f10b6147a0569e2d573e54 /vpx_dsp | |
parent | 028fc1b50f196cab1ec93816654fbefe64f20cf3 (diff) | |
download | libvpx-281dfae8353940fe380c73384607ec11a5c53f43.tar libvpx-281dfae8353940fe380c73384607ec11a5c53f43.tar.gz libvpx-281dfae8353940fe380c73384607ec11a5c53f43.tar.bz2 libvpx-281dfae8353940fe380c73384607ec11a5c53f43.zip |
neon,load_unaligned_*: use dup for lane 0
this produces better assembly with gcc (11.3.0-3); no change in assembly
using clang from the r24 android sdk (Android (8075178, based on
r437112b) clang version 14.0.1
(https://android.googlesource.com/toolchain/llvm-project
8671348b81b95fc603505dfc881b45103bee1731))
Change-Id: Ifec252d4f499f23be1cd94aa8516caf6b3fbbc11
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/mem_neon.h | 8 | ||||
-rw-r--r-- | vpx_dsp/arm/sad4d_neon.c | 4 |
2 files changed, 6 insertions, 6 deletions
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 50aaa94fe..84aae161b 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32 = vdup_n_u32(0); + uint32x2_t a_u32; if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vset_lane_u32(a, a_u32, 0); + a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32 = vdupq_n_u32(0); + uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vsetq_lane_u32(a, a_u32, 0); + a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 03f716c3d..53866296c 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -20,9 +20,9 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, const void *const buf1) { uint32_t a; - uint32x2_t aa = vdup_n_u32(0); + uint32x2_t aa; memcpy(&a, buf0, 4); - aa = vset_lane_u32(a, aa, 0); + aa = vdup_n_u32(a); memcpy(&a, buf1, 4); aa = vset_lane_u32(a, aa, 1); return vreinterpret_u8_u32(aa); |