author    George Steed <george.steed@arm.com>  2023-03-21 14:31:50 +0000
committer George Steed <george.steed@arm.com>  2023-03-29 08:39:35 +0000
commit    4cf9819282aa123e8b126731ef5629ee5144cd86 (patch)
tree      531d2b6bedb504ef71c73b213242a918b652acf0 /vpx_dsp
parent    5b05f6f3a01d7e25d0573b482245a2b8b0eb09bd (diff)
Avoid LD2/ST2 instructions in vpx_dc_predictor_32x32_neon
The LD2 and ST2 instructions are useful when dealing with interleaved data (e.g. real/imag components of complex numbers), but for simply loading or storing larger quantities of data it is preferable to use two of the normal load/store instructions instead. This patch replaces such occurrences in vpx_dc_predictor_32x32_neon and related functions.

With Clang-15 this speeds up the function by 10-30%, and with GCC-12 by 40-60%, depending on the micro-architecture being benchmarked on.

Change-Id: I670dc37908aa238f360104efd74d6c2108ecf945
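As a minimal sketch of why the two load patterns are interchangeable here (not part of the patch; the helper names sum_32_ld2/sum_32_ld1 and the use of vaddvq_u16 for the final horizontal add are illustrative assumptions), both variants below reduce the same 32 contiguous bytes, so the sum is identical; only the load pattern differs:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Old pattern: vld2q_u8 de-interleaves the 32 bytes, putting even-indexed
     * bytes in r.val[0] and odd-indexed bytes in r.val[1]. */
    static uint32_t sum_32_ld2(const uint8_t *ref) {
      const uint8x16x2_t r = vld2q_u8(ref);
      const uint16x8_t p =
          vaddq_u16(vpaddlq_u8(r.val[0]), vpaddlq_u8(r.val[1]));
      return vaddvq_u16(p); /* AArch64 horizontal add, used here for brevity */
    }

    /* New pattern: two plain contiguous loads; no de-interleaving work. */
    static uint32_t sum_32_ld1(const uint8_t *ref) {
      const uint8x16_t r0 = vld1q_u8(ref + 0);
      const uint8x16_t r1 = vld1q_u8(ref + 16);
      const uint16x8_t p = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
      return vaddvq_u16(p);
    }

The same reasoning applies on the store side: the old code set both halves of the uint8x16x2_t to the same broadcast DC value, so the byte-interleaving performed by ST2 produced the same 32 identical output bytes that two plain ST1 stores produce, only with extra permute work on most micro-architectures.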
Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/intrapred_neon.c  28
1 file changed, 15 insertions, 13 deletions
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 892310f15..b7f2a11ca 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -193,9 +193,10 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
// DC 32x32
static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
- const uint8x16x2_t r = vld2q_u8(ref);
- const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
- const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
+ const uint8x16_t r0 = vld1q_u8(ref + 0);
+ const uint8x16_t r1 = vld1q_u8(ref + 16);
+ const uint16x8_t p0 = vpaddlq_u8(r0);
+ const uint16x8_t p1 = vpaddlq_u8(r1);
const uint16x8_t p2 = vaddq_u16(p0, p1);
uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
sum = vpadd_u16(sum, sum);
@@ -204,23 +205,24 @@ static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
const uint8x8_t dc) {
- uint8x16x2_t dc_dup;
+ uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
int i;
- dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
-
for (i = 0; i < 32; ++i, dst += stride) {
- vst2q_u8(dst, dc_dup);
+ vst1q_u8(dst + 0, dc_dup);
+ vst1q_u8(dst + 16, dc_dup);
}
}
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- const uint8x16x2_t a = vld2q_u8(above);
- const uint8x16x2_t l = vld2q_u8(left);
- const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
- const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
- const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
- const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
+ const uint8x16_t a0 = vld1q_u8(above + 0);
+ const uint8x16_t a1 = vld1q_u8(above + 16);
+ const uint8x16_t l0 = vld1q_u8(left + 0);
+ const uint8x16_t l1 = vld1q_u8(left + 16);
+ const uint16x8_t pa0 = vpaddlq_u8(a0);
+ const uint16x8_t pl0 = vpaddlq_u8(l0);
+ const uint16x8_t pa1 = vpaddlq_u8(a1);
+ const uint16x8_t pl1 = vpaddlq_u8(l1);
const uint16x8_t pa = vaddq_u16(pa0, pa1);
const uint16x8_t pl = vaddq_u16(pl0, pl1);
const uint16x8_t pal = vaddq_u16(pa, pl);