summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSalome Thirot <salome.thirot@arm.com>2023-03-06 11:37:26 +0000
committerSalome Thirot <salome.thirot@arm.com>2023-03-07 12:04:25 +0000
commiteec48083936b52bc0ec9adfc452d29b177366d75 (patch)
treea8fd7e7090a4ae2e6b371cbac59e55fc64dc7087
parent57c6ea97522146e9471a3537304ce8a0a7a22ea0 (diff)
downloadlibvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar
libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.gz
libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.tar.bz2
libvpx-eec48083936b52bc0ec9adfc452d29b177366d75.zip
Add Neon implementation of vp9_block_error_c
Add Neon implementation of vp9_block_error_c as well as the corresponding tests. Change-Id: I79247b5ae24f51b7b55fc5e517d5e403dc86367a
-rw-r--r--test/test.mk2
-rw-r--r--test/vp9_block_error_test.cc8
-rw-r--r--vp9/common/vp9_rtcd_defs.pl4
-rw-r--r--vp9/encoder/arm/neon/vp9_error_neon.c47
4 files changed, 58 insertions, 3 deletions
diff --git a/test/test.mk b/test/test.mk
index f60d8f823..3c225bc75 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -179,7 +179,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes)
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2)))
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON)))
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc
index b93b014e6..bde84cd61 100644
--- a/test/vp9_block_error_test.cc
+++ b/test/vp9_block_error_test.cc
@@ -197,4 +197,12 @@ INSTANTIATE_TEST_SUITE_P(
&BlockError8BitWrapper<vp9_block_error_c>,
VPX_BITS_8)));
#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+ NEON, BlockErrorTest,
+ ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
+ &BlockError8BitWrapper<vp9_block_error_c>,
+ VPX_BITS_8)));
+#endif
} // namespace
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c939411a3..2f9870dd4 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -136,12 +136,12 @@ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t
specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- specialize qw/vp9_block_error avx2 sse2/;
+ specialize qw/vp9_block_error neon avx2 sse2/;
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/;
} else {
- specialize qw/vp9_block_error avx2 msa sse2/;
+ specialize qw/vp9_block_error neon avx2 msa sse2/;
}
# fdct functions
diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c
index eb1e2e03d..0cf0bf250 100644
--- a/vp9/encoder/arm/neon/vp9_error_neon.c
+++ b/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -15,6 +15,53 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ uint32x4_t err;
+ int32x4_t ssz0, ssz1;
+
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before
+ // accumulating them in 64-bits.
+ err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+  // We can't do the same here as we're operating on signed integers, so we
+  // can only accumulate 2 15-bit squares before widening into 64 bits.
+ ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+ ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+ ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+ ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_s64);
+ return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
const tran_low_t *dqcoeff, int block_size) {
uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };