author     James Zern <jzern@google.com>  2023-02-28 02:44:28 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>  2023-02-28 02:44:28 +0000
commit     372989240d57f2a585785dd52f14e815986180ea (patch)
tree       de69da21b274d8c1028a4490ff59e34533f0a8b7
parent     c70d57c71afdf1a47b1fb0d87938e1678786c713 (diff)
parent     ccc101e6bb63c2af340b993c57fad0f3810aee27 (diff)
Merge "Add Neon implementations of standard bitdepth MSE functions" into main
-rw-r--r--  test/variance_test.cc           7
-rw-r--r--  vpx_dsp/arm/variance_neon.c   182
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl    6
3 files changed, 127 insertions, 68 deletions
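
For orientation: the vpx_mse functions compute a sum of squared differences over a fixed-size block, storing the result through *sse and returning it (despite the name, no division by the sample count takes place, as the return statements in the diff below confirm). A minimal scalar sketch of that contract, matching the prototypes declared in vpx_dsp_rtcd_defs.pl; the function name and loop structure here are illustrative, with the C reference living in vpx_dsp/variance.c:

#include <stdint.h>

/* Illustrative scalar sketch of the vpx_mse contract: the sum of squared
 * differences over a w x h block is both stored through *sse and returned. */
static unsigned int mse_wxh_scalar_sketch(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, int w, int h,
                                          unsigned int *sse) {
  unsigned int sum = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src_ptr[j] - ref_ptr[j];
      sum += (unsigned int)(diff * diff);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  *sse = sum;
  return sum;
}
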
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 33f09209f..a68cfad51 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
TEST_P(VpxMseTest, RefMse) { RefTestMse(); }
TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
TEST_P(VpxVarianceTest, Ref) { RefTest(); }
TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
@@ -1450,8 +1451,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest,
&vpx_get4x4sse_cs_neon)));
INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest,
- ::testing::Values(MseParams(4, 4,
- &vpx_mse16x16_neon)));
+ ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon),
+ MseParams(4, 3, &vpx_mse16x8_neon),
+ MseParams(3, 4, &vpx_mse8x16_neon),
+ MseParams(3, 3, &vpx_mse8x8_neon)));
INSTANTIATE_TEST_SUITE_P(
NEON, VpxVarianceTest,
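
A side note on the test parameters: the pairings above imply that MseParams takes log2 block dimensions, e.g. MseParams(4, 3, ...) registering vpx_mse16x8_neon. This reading is inferred from the mapping rather than stated in the diff:

/* Inferred from the instantiations above (not stated in this diff):
 * MseParams(log2w, log2h, fn) registers fn for a
 * (1 << log2w) x (1 << log2h) block, so (4, 3) -> 16x8,
 * (3, 4) -> 8x16, (3, 3) -> 8x8. */
static int mse_block_width(int log2w) { return 1 << log2w; }  /* 4 -> 16 */
static int mse_block_height(int log2h) { return 1 << log2h; } /* 3 -> 8 */
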
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index 3ccc4e807..feff980c9 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -371,32 +371,66 @@ VARIANCE_WXH_NEON(64, 64, 12)
#if defined(__ARM_FEATURE_DOTPROD)
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int i;
- uint8x16_t a[2], b[2], abs_diff[2];
- uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
- for (i = 0; i < 8; i++) {
- a[0] = vld1q_u8(src_ptr);
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1_u8(src_ptr);
src_ptr += src_stride;
- a[1] = vld1q_u8(src_ptr);
+ s1 = vld1_u8(src_ptr);
src_ptr += src_stride;
- b[0] = vld1q_u8(ref_ptr);
+ r0 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- b[1] = vld1q_u8(ref_ptr);
+ r1 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- abs_diff[0] = vabdq_u8(a[0], b[0]);
- abs_diff[1] = vabdq_u8(a[1], b[1]);
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
- sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]);
- sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]);
- }
+ sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
- *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
- return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
+ *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ s1 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r0 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ r1 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+
+ diff0 = vabdq_u8(s0, r0);
+ diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
}
unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
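
The dot-product variants above exploit the fact that |src - ref| fits in 8 bits: feeding the absolute difference to UDOT as both operands squares and accumulates four differences per 32-bit lane in a single instruction. A scalar model of one lane of the vdot_u32/vdotq_u32 step (illustrative only; the intrinsics map to the Armv8.2-A UDOT instruction guarded by __ARM_FEATURE_DOTPROD):

#include <stdint.h>

/* Scalar model of one 32-bit accumulator lane of the UDOT step: the lane
 * gains the dot product of four unsigned bytes. With a[k] == b[k] ==
 * |src - ref|, the lane accumulates four squared differences, which is
 * exactly the MSE inner loop. */
static uint32_t udot_lane_model(uint32_t acc, const uint8_t a[4],
                                const uint8_t b[4]) {
  int k;
  for (k = 0; k < 4; ++k) acc += (uint32_t)a[k] * b[k];
  return acc;
}

Overflow is not a concern here: even a 16x16 block sums at most 256 squared differences of at most 255 * 255 = 65025 each, comfortably below 2^32.
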
@@ -435,58 +469,67 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
#else // !defined(__ARM_FEATURE_DOTPROD)
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int i;
- uint8x16_t a[2], b[2];
- int16x4_t diff_lo[4], diff_hi[4];
- uint16x8_t diff[4];
- int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
- vdupq_n_s32(0) };
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
- for (i = 0; i < 8; i++) {
- a[0] = vld1q_u8(src_ptr);
+ int i = h / 2;
+ do {
+ uint8x8_t s0, s1, r0, r1, diff0, diff1;
+ uint16x8_t sse0, sse1;
+
+ s0 = vld1_u8(src_ptr);
src_ptr += src_stride;
- a[1] = vld1q_u8(src_ptr);
+ s1 = vld1_u8(src_ptr);
src_ptr += src_stride;
- b[0] = vld1q_u8(ref_ptr);
+ r0 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- b[1] = vld1q_u8(ref_ptr);
+ r1 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
- diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0]));
- diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0]));
- diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1]));
- diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1]));
-
- diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
- diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
- sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]);
- sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]);
-
- diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
- diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
- sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]);
- sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]);
-
- diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
- diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
- sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]);
- sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]);
-
- diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
- diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
- sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]);
- sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]);
- }
+ diff0 = vabd_u8(s0, r0);
+ diff1 = vabd_u8(s1, r1);
+
+ sse0 = vmull_u8(diff0, diff0);
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(diff1, diff1);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+ int src_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s, r, diff;
+ uint16x8_t sse0, sse1;
- sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]);
- sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]);
- sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]);
+ s = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ r = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
- *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
- return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
+ diff = vabdq_u8(s, r);
+
+ sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+ sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+ } while (--i != 0);
+
+ *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
}
unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
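
Without the dot-product extension, the rewritten fallback reaches the same result through widening arithmetic, more simply than the vsubl_u8/vmlal_s16 sequence it replaces: because |src - ref| <= 255, the square fits in 16 bits, so vmull_u8 can square eight absolute differences at once and vpadalq_u16 pairwise-accumulates the 16-bit squares into 32-bit lanes. A scalar model of that chain for one accumulator lane (illustrative only):

#include <stdint.h>

/* Scalar model of the vabd_u8 -> vmull_u8 -> vpadalq_u16 chain above, for
 * the pair of byte positions feeding one 32-bit accumulator lane: each
 * absolute difference squares into 16 bits without overflow
 * (255 * 255 = 65025 <= UINT16_MAX), and each pair of squares is added
 * into the 32-bit running sum. */
static uint32_t abd_mull_padal_model(uint32_t acc, uint8_t s0, uint8_t r0,
                                     uint8_t s1, uint8_t r1) {
  const uint16_t d0 = (uint16_t)(s0 > r0 ? s0 - r0 : r0 - s0);
  const uint16_t d1 = (uint16_t)(s1 > r1 ? s1 - r1 : r1 - s1);
  return acc + (uint32_t)d0 * d0 + (uint32_t)d1 * d1;
}
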
@@ -531,3 +574,16 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
}
#endif // defined(__ARM_FEATURE_DOTPROD)
+
+#define VPX_MSE_WXH_NEON(w, h) \
+ unsigned int vpx_mse##w##x##h##_neon( \
+ const unsigned char *src_ptr, int src_stride, \
+ const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \
+ return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h, \
+ sse); \
+ }
+
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
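
The macro stamps out thin wrappers over the two width-specialized helpers; expanding VPX_MSE_WXH_NEON(16, 16), for example, yields:

unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
                               const unsigned char *ref_ptr, int ref_stride,
                               unsigned int *sse) {
  return vpx_mse16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse);
}
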
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index eef72249e..0ad3cbe6b 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1141,13 +1141,13 @@ add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride
specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
+ specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/;
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
+ specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/;
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
+ specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss sse2 msa vsx/;
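
For context, add_proto declares a function's signature and specialize lists the architecture-specific implementations the RTCD (run-time CPU detection) generator may dispatch to; adding neon to these lines is what makes the new kernels reachable. The generated dispatch roughly takes the following shape (a sketch of the mechanism, not the literal output of processing vpx_dsp_rtcd_defs.pl):

#include <stdint.h>

/* Sketch of the RTCD dispatch shape implied by the specialize lines above
 * (illustrative; not the literal generated vpx_dsp_rtcd.h): */
unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          unsigned int *sse);
unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int *sse);

unsigned int (*vpx_mse8x8)(const uint8_t *, int, const uint8_t *, int,
                           unsigned int *);

/* Selected once at startup based on CPU feature detection: */
static void rtcd_setup_sketch(int have_neon) {
  vpx_mse8x8 = have_neon ? vpx_mse8x8_neon : vpx_mse8x8_c;
}
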