diff options
author | Salome Thirot <salome.thirot@arm.com> | 2023-03-06 11:37:26 +0000 |
---|---|---|
committer | Salome Thirot <salome.thirot@arm.com> | 2023-03-07 12:04:25 +0000 |
commit | eec48083936b52bc0ec9adfc452d29b177366d75 (patch) | |
tree | a8fd7e7090a4ae2e6b371cbac59e55fc64dc7087 /vp9/encoder | |
parent | 57c6ea97522146e9471a3537304ce8a0a7a22ea0 (diff) | |
download | libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.gz libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.bz2 libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.zip |
Add Neon implementation of vp9_block_error_c
Add Neon implementation of vp9_block_error_c as well as the
corresponding tests.
Change-Id: I79247b5ae24f51b7b55fc5e517d5e403dc86367a
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_error_neon.c | 47 |
1 file changed, 47 insertions, 0 deletions
// Computes the sum of squared errors between the quantized-then-dequantized
// coefficients and the original coefficients, and (via *ssz) the sum of
// squared original coefficients, 16 coefficients per iteration.
int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz) {
  uint64x2_t error_acc = vdupq_n_u64(0);
  int64x2_t sqcoeff_acc = vdupq_n_s64(0);

  assert(block_size >= 16);
  assert((block_size % 16) == 0);

  do {
    const int16x8_t c_lo = load_tran_low_to_s16q(coeff);
    const int16x8_t c_hi = load_tran_low_to_s16q(coeff + 8);
    const int16x8_t d_lo = load_tran_low_to_s16q(dqcoeff);
    const int16x8_t d_hi = load_tran_low_to_s16q(dqcoeff + 8);

    const uint16x8_t abs_diff_lo =
        vreinterpretq_u16_s16(vabdq_s16(c_lo, d_lo));
    const uint16x8_t abs_diff_hi =
        vreinterpretq_u16_s16(vabdq_s16(c_hi, d_hi));

    // Each absolute difference fits in 15 bits, so its square fits in 30 bits;
    // four such squares can be gathered into 32-bit lanes before the pairwise
    // widening accumulation into the 64-bit running total.
    uint32x4_t error =
        vmull_u16(vget_low_u16(abs_diff_lo), vget_low_u16(abs_diff_lo));
    error = vmlal_u16(error, vget_high_u16(abs_diff_lo),
                      vget_high_u16(abs_diff_lo));
    error = vmlal_u16(error, vget_low_u16(abs_diff_hi),
                      vget_low_u16(abs_diff_hi));
    error = vmlal_u16(error, vget_high_u16(abs_diff_hi),
                      vget_high_u16(abs_diff_hi));
    error_acc = vpadalq_u32(error_acc, error);

    // The coefficient squares are accumulated through signed multiplies, so
    // only two 15-bit products are combined per 32-bit lane before widening
    // into 64 bits.
    int32x4_t sq_lo = vmull_s16(vget_low_s16(c_lo), vget_low_s16(c_lo));
    sq_lo = vmlal_s16(sq_lo, vget_high_s16(c_lo), vget_high_s16(c_lo));
    sqcoeff_acc = vpadalq_s32(sqcoeff_acc, sq_lo);

    int32x4_t sq_hi = vmull_s16(vget_low_s16(c_hi), vget_low_s16(c_hi));
    sq_hi = vmlal_s16(sq_hi, vget_high_s16(c_hi), vget_high_s16(c_hi));
    sqcoeff_acc = vpadalq_s32(sqcoeff_acc, sq_hi);

    coeff += 16;
    dqcoeff += 16;
    block_size -= 16;
  } while (block_size != 0);

  *ssz = horizontal_add_int64x2(sqcoeff_acc);
  return (int64_t)horizontal_add_uint64x2(error_acc);
}