author     James Zern <jzern@google.com>  2023-02-28 02:44:28 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>  2023-02-28 02:44:28 +0000
commit     372989240d57f2a585785dd52f14e815986180ea (patch)
tree       de69da21b274d8c1028a4490ff59e34533f0a8b7
parent     c70d57c71afdf1a47b1fb0d87938e1678786c713 (diff)
parent     ccc101e6bb63c2af340b993c57fad0f3810aee27 (diff)
Merge "Add Neon implementations of standard bitdepth MSE functions" into main
-rw-r--r--  test/variance_test.cc        |   7
-rw-r--r--  vpx_dsp/arm/variance_neon.c  | 182
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl |   6
3 files changed, 127 insertions(+), 68 deletions(-)
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 33f09209f..a68cfad51 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
 TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
 TEST_P(VpxMseTest, RefMse) { RefTestMse(); }
 TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
 TEST_P(VpxVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
@@ -1450,8 +1451,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest,
                                                      &vpx_get4x4sse_cs_neon)));
 
 INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest,
-                         ::testing::Values(MseParams(4, 4,
-                                                     &vpx_mse16x16_neon)));
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon),
+                                           MseParams(4, 3, &vpx_mse16x8_neon),
+                                           MseParams(3, 4, &vpx_mse8x16_neon),
+                                           MseParams(3, 3, &vpx_mse8x8_neon)));
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, VpxVarianceTest,
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
index 3ccc4e807..feff980c9 100644
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -371,32 +371,66 @@ VARIANCE_WXH_NEON(64, 64, 12)
 
 #if defined(__ARM_FEATURE_DOTPROD)
 
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
-                               const unsigned char *ref_ptr, int ref_stride,
-                               unsigned int *sse) {
-  int i;
-  uint8x16_t a[2], b[2], abs_diff[2];
-  uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
-
-  for (i = 0; i < 8; i++) {
-    a[0] = vld1q_u8(src_ptr);
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride, int h,
+                                           unsigned int *sse) {
+  uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    a[1] = vld1q_u8(src_ptr);
+    s1 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    b[0] = vld1q_u8(ref_ptr);
+    r0 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
-    b[1] = vld1q_u8(ref_ptr);
+    r1 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
 
-    abs_diff[0] = vabdq_u8(a[0], b[0]);
-    abs_diff[1] = vabdq_u8(a[1], b[1]);
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
 
-    sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]);
-    sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]);
-  }
+    sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
 
-  *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
-  return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1]));
+  *sse = horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+                                            int src_stride,
+                                            const unsigned char *ref_ptr,
+                                            int ref_stride, int h,
+                                            unsigned int *sse) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabdq_u8(s0, r0);
+    diff1 = vabdq_u8(s1, r1);
+
+    sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
 }
 
 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
@@ -435,58 +469,67 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
 
 #else  // !defined(__ARM_FEATURE_DOTPROD)
 
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
-                               const unsigned char *ref_ptr, int ref_stride,
-                               unsigned int *sse) {
-  int i;
-  uint8x16_t a[2], b[2];
-  int16x4_t diff_lo[4], diff_hi[4];
-  uint16x8_t diff[4];
-  int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
-                           vdupq_n_s32(0) };
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride, int h,
+                                           unsigned int *sse) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
 
-  for (i = 0; i < 8; i++) {
-    a[0] = vld1q_u8(src_ptr);
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+    uint16x8_t sse0, sse1;
+
+    s0 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    a[1] = vld1q_u8(src_ptr);
+    s1 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    b[0] = vld1q_u8(ref_ptr);
+    r0 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
-    b[1] = vld1q_u8(ref_ptr);
+    r1 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
 
-    diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0]));
-    diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0]));
-    diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1]));
-    diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1]));
-
-    diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
-    diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
-    sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]);
-    sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]);
-
-    diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
-    diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
-    sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]);
-    sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]);
-
-    diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
-    diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
-    sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]);
-    sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]);
-
-    diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
-    diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
-    sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]);
-    sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]);
-  }
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
+
+    sse0 = vmull_u8(diff0, diff0);
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(diff1, diff1);
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
+}
+
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+                                            int src_stride,
+                                            const unsigned char *ref_ptr,
+                                            int ref_stride, int h,
+                                            unsigned int *sse) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint8x16_t s, r, diff;
+    uint16x8_t sse0, sse1;
 
-  sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]);
-  sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]);
-  sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]);
+    s = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
 
-  *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
-  return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0]));
+    diff = vabdq_u8(s, r);
+
+    sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
+
+  *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+  return *sse;
 }
 
 unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
@@ -531,3 +574,16 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
 }
 
 #endif  // defined(__ARM_FEATURE_DOTPROD)
+
+#define VPX_MSE_WXH_NEON(w, h)                                               \
+  unsigned int vpx_mse##w##x##h##_neon(                                      \
+      const unsigned char *src_ptr, int src_stride,                          \
+      const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) {     \
+    return vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h,  \
+                               sse);                                         \
+  }
+
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index eef72249e..0ad3cbe6b 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1141,13 +1141,13 @@ add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride
   specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
+  specialize qw/vpx_mse16x8 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
+  specialize qw/vpx_mse8x16 sse2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
+  specialize qw/vpx_mse8x8 sse2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
   specialize qw/vpx_get_mb_ss sse2 msa vsx/;
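Note on semantics: despite the "MSE" name, the vpx_mse* functions return the raw sum of squared errors (SSE) over the w x h block and also store it through the sse out-parameter; division by the pixel count is left to callers. A minimal scalar sketch of that contract (the helper name mse_ref_c is hypothetical and for illustration only; libvpx's actual C reference lives in vpx_dsp/variance.c):

#include <stdint.h>

/* Scalar sketch of the contract implemented by the Neon kernels above:
 * accumulate the sum of squared differences between a w x h source block
 * and reference block, store it via *sse, and return it. */
static unsigned int mse_ref_c(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride, int w,
                              int h, unsigned int *sse) {
  unsigned int sum = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src_ptr[j] - ref_ptr[j];
      sum += (unsigned int)(diff * diff);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  *sse = sum;
  return sum; /* SSE, not SSE / (w * h), matching the vpx_mse* convention. */
}

The same shapes explain the MseParams arguments in the test change: the first two values are log2 block dimensions, so MseParams(4, 3, &vpx_mse16x8_neon) exercises the 16x8 kernel.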
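On the two Neon paths: when __ARM_FEATURE_DOTPROD is available, each vabd/vdot pair squares and accumulates a whole vector of absolute differences in a single multiply-accumulate step; the fallback path instead widens the differences with vmull_u8 and folds the 16-bit squares into 32-bit lanes with vpadalq_u16. A scalar model of the 64-bit vdot_u32 step used by vpx_mse8xh_neon, assuming standard Armv8.2 UDOT lane semantics (vdot_u32_model is an illustrative name, not a real intrinsic):

#include <stdint.h>

/* Scalar model of vdot_u32(acc, d, d): each of the two 32-bit accumulator
 * lanes gains the dot product of four consecutive u8 elements with
 * themselves, i.e. four squared absolute differences per lane. */
static void vdot_u32_model(uint32_t acc[2], const uint8_t d[8]) {
  int lane, k;
  for (lane = 0; lane < 2; ++lane) {
    uint32_t dot = 0;
    for (k = 0; k < 4; ++k) {
      dot += (uint32_t)d[4 * lane + k] * d[4 * lane + k];
    }
    acc[lane] += dot; /* add-accumulate, matching the intrinsic's form */
  }
}

Because the 8-bit products are accumulated directly into 32-bit lanes, no intermediate can overflow for the block sizes handled here, which is why the dotprod path needs no explicit widening steps.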