diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2023-05-30 15:22:04 +0100 |
---|---|---|
committer | Jonathan Wright <jonathan.wright@arm.com> | 2023-05-31 14:30:02 +0100 |
commit | c738e87f27ef8e12dd28b9052f446a5f69abf3c9 (patch) | |
tree | c24856549456db876b489192cd3b47b7e43c4272 | |
parent | 99522d307ccef8b53d373beab8c5b6bf997ca4ef (diff) | |
download | libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar.gz libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.tar.bz2 libvpx-c738e87f27ef8e12dd28b9052f446a5f69abf3c9.zip |
Optimize Neon implementation of vpx_int_pro_col
Use widening pairwise addition instructions to halve the number of
additions required.
Change-Id: I0307a3b65e50d2b1ae582938bc5df9c2b21df734
-rw-r--r-- | vpx_dsp/arm/avg_neon.c | 14 |
1 file changed, 7 insertions, 7 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index 8c61fc26f..2fe65d112 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -121,17 +121,17 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
 }
 
 int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t sum;
   int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
 
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
+  assert(width >= 16 && width % 16 == 0);
+
+  sum = vpaddlq_u8(vld1q_u8(ref));
+  for (i = 16; i < width; i += 16) {
+    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
   }
 
-  return (int16_t)horizontal_add_uint16x8(vec_sum);
+  return (int16_t)horizontal_add_uint16x8(sum);
 }
 
 // ref, src = [0, 510] - max diff = 16-bits