diff options
author    | Salome Thirot <salome.thirot@arm.com> | 2023-03-06 11:37:26 +0000
committer | Salome Thirot <salome.thirot@arm.com> | 2023-03-07 12:04:25 +0000
commit    | eec48083936b52bc0ec9adfc452d29b177366d75 (patch)
tree      | a8fd7e7090a4ae2e6b371cbac59e55fc64dc7087
parent    | 57c6ea97522146e9471a3537304ce8a0a7a22ea0 (diff)
download  | libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar
          | libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.gz
          | libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.bz2
          | libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.zip
Add Neon implementation of vp9_block_error_c
Add Neon implementation of vp9_block_error_c as well as the
corresponding tests.
Change-Id: I79247b5ae24f51b7b55fc5e517d5e403dc86367a
-rw-r--r-- | test/test.mk                          |  2
-rw-r--r-- | test/vp9_block_error_test.cc          |  8
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl           |  4
-rw-r--r-- | vp9/encoder/arm/neon/vp9_error_neon.c | 47
4 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/test/test.mk b/test/test.mk
index f60d8f823..3c225bc75 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -179,7 +179,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2)))
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON)))
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc
index b93b014e6..bde84cd61 100644
--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@@ -197,4 +197,12 @@ INSTANTIATE_TEST_SUITE_P(
                            &BlockError8BitWrapper<vp9_block_error_c>,
                            VPX_BITS_8)));
 #endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlockErrorTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
+                                 &BlockError8BitWrapper<vp9_block_error_c>,
+                                 VPX_BITS_8)));
+#endif
 } // namespace
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c939411a3..2f9870dd4 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -136,12 +136,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
 specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  specialize qw/vp9_block_error avx2 sse2/;
+  specialize qw/vp9_block_error neon avx2 sse2/;
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
 } else {
-  specialize qw/vp9_block_error avx2 msa sse2/;
+  specialize qw/vp9_block_error neon avx2 msa sse2/;
 }
 # fdct functions
diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c
index
eb1e2e03d..0cf0bf250 100644
--- a/vp9/encoder/arm/neon/vp9_error_neon.c
+++ b/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -15,6 +15,53 @@
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/sum_neon.h"
 
+// Neon counterpart of vp9_block_error_c: returns the sum of squared
+// differences between coeff and dqcoeff over block_size elements, and writes
+// the sum of squared coeff values ("source sum of squares") to *ssz.
+// block_size must be a non-zero multiple of 16 (16 elements are consumed per
+// loop iteration).
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    uint32x4_t err;
+    int32x4_t ssz0, ssz1;
+
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    // Absolute difference, reinterpreted as unsigned so the unsigned widening
+    // multiply-accumulate below can be used.
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // Each diff fits in 15 bits, so each square fits in 30 bits: four squares
+    // can be accumulated in a 32-bit lane before widening into the 64-bit
+    // accumulator (vpadalq_u32).
+    err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+    err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+    err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here: the coefficients are signed, so only two
+    // (up to 30-bit) squares can be accumulated per 32-bit lane before the
+    // partial sums must be widened into the 64-bit accumulator.
+    ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+    ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+    ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+    ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = horizontal_add_int64x2(ssz_s64);
+  return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
 int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
                                 const tran_low_t *dqcoeff, int block_size) {
   uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };