summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jonathan Wright <jonathan.wright@arm.com> 2023-05-30 15:22:04 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2023-05-31 14:30:02 +0100
commit: c738e87f27ef8e12dd28b9052f446a5f69abf3c9 (patch)
tree: c24856549456db876b489192cd3b47b7e43c4272
parent: 99522d307ccef8b53d373beab8c5b6bf997ca4ef (diff)
download: libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar
libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar.gz
libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar.bz2
libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.zip
Optimize Neon implementation of vpx_int_pro_col
Use widening pairwise addition instructions to halve the number of additions required.

Change-Id: I0307a3b65e50d2b1ae582938bc5df9c2b21df734
-rw-r--r-- vpx_dsp/arm/avg_neon.c 14
1 files changed, 7 insertions, 7 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index 8c61fc26f..2fe65d112 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -121,17 +121,17 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
}
int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+ uint16x8_t sum;
int i;
- uint16x8_t vec_sum = vdupq_n_u16(0);
- for (i = 0; i < width; i += 16) {
- const uint8x16_t vec_row = vld1q_u8(ref);
- vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
- vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
- ref += 16;
+ assert(width >= 16 && width % 16 == 0);
+
+ sum = vpaddlq_u8(vld1q_u8(ref));
+ for (i = 16; i < width; i += 16) {
+ sum = vpadalq_u8(sum, vld1q_u8(ref + i));
}
- return (int16_t)horizontal_add_uint16x8(vec_sum);
+ return (int16_t)horizontal_add_uint16x8(sum);
}
// ref, src = [0, 510] - max diff = 16-bits