diff options
author | Salome Thirot <salome.thirot@arm.com> | 2023-03-10 16:30:36 +0000 |
---|---|---|
committer | Salome Thirot <salome.thirot@arm.com> | 2023-03-22 10:50:17 +0000 |
commit | 5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a (patch) | |
tree | 55710015d76eca92f8ef322062e653776bf45615 | |
parent | 882399bd54a82aa72ba766356d8fda31fbe40450 (diff) | |
download | libvpx-5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a.tar libvpx-5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a.tar.gz libvpx-5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a.tar.bz2 libvpx-5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a.zip |
Add Neon implementations of vpx_highbd_avg_<w>x<h>_c
Add Neon implementation of vpx_highbd_avg_4x4_c and vpx_highbd_avg_8x8_c
as well as the corresponding tests.
Change-Id: Ib1b06af5206774347690c9c56e194b76aa409c91
-rw-r--r-- | test/avg_test.cc | 7 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_avg_neon.c | 24 | ||||
-rw-r--r-- | vpx_dsp/arm/mem_neon.h | 21 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 |
4 files changed, 54 insertions, 2 deletions
diff --git a/test/avg_test.cc b/test/avg_test.cc
index dd8440332..a0428304a 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -582,6 +582,13 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
 #endif // HAVE_SSE2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon)));
+#endif // HAVE_NEON
+
 INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest,
                          ::testing::Values(make_tuple(16, &vpx_satd_c),
                                            make_tuple(64, &vpx_satd_c),
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
index b84a7875d..fc10197d7 100644
--- a/vpx_dsp/arm/highbd_avg_neon.c
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -16,6 +16,30 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride);
+  const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride);
+  return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;
+}
+
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+  load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  sum = vaddq_u16(a0, a1);
+  sum = vaddq_u16(sum, a2);
+  sum = vaddq_u16(sum, a3);
+  sum = vaddq_u16(sum, a4);
+  sum = vaddq_u16(sum, a5);
+  sum = vaddq_u16(sum, a6);
+  sum = vaddq_u16(sum, a7);
+
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
+
 // coeff: 32 bits, dynamic range [-2147483648, 2147483647].
 // length: value range {16, 64, 256, 1024}.
 // satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024]
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index fa14f80b2..1a20da70e 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -419,4 +419,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
   vst1q_u8(s, s7);
 }
 
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+  *s4 = vld1q_u16(s);
+  s += p;
+  *s5 = vld1q_u16(s);
+  s += p;
+  *s6 = vld1q_u16(s);
+  s += p;
+  *s7 = vld1q_u16(s);
+}
+
 #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 7cd3a0be8..6637186f8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -995,10 +995,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Avg
   #
   add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
-  specialize qw/vpx_highbd_avg_8x8 sse2/;
+  specialize qw/vpx_highbd_avg_8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
-  specialize qw/vpx_highbd_avg_4x4 sse2/;
+  specialize qw/vpx_highbd_avg_4x4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
   specialize qw/vpx_highbd_minmax_8x8 neon/;