diff options
author | James Zern <jzern@google.com> | 2023-03-01 23:13:34 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2023-03-01 23:13:34 +0000 |
commit | 0e7804ca30c367eefb17594a0c5096f2f26de732 (patch) | |
tree | 787f78105faa0da291ac3fe3fe23197dd475625e | |
parent | 7478b7e4e481562a4a13f233acb66a60462e1934 (diff) | |
parent | 096cd0ba8ab2126682a9f6f01d6c8c0084d2f8ab (diff) | |
download | libvpx-0e7804ca30c367eefb17594a0c5096f2f26de732.tar libvpx-0e7804ca30c367eefb17594a0c5096f2f26de732.tar.gz libvpx-0e7804ca30c367eefb17594a0c5096f2f26de732.tar.bz2 libvpx-0e7804ca30c367eefb17594a0c5096f2f26de732.zip |
Merge "Optimize Neon implementation of high bitdepth MSE functions" into main
-rw-r--r-- | test/variance_test.cc | 16 | ||||
-rw-r--r-- | vpx_dsp/arm/highbd_variance_neon.c | 197 |
2 files changed, 169 insertions, 44 deletions
diff --git a/test/variance_test.cc b/test/variance_test.cc index a68cfad51..1359bc4ba 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1508,6 +1508,22 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 89bd5c579..d0b366c95 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -351,50 +351,159 @@ HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ } -#define HIGHBD_MSE(w, h) \ - uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)sse_long; \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint64_t sse_long = 0; \ - int64_t sum_long = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ - &sse_long, &sum_long); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ - return *sse; \ - } - HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +#if defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_uint32x4(sse_u32); + return *sse; +} + +#else // !defined(__ARM_FEATURE_DOTPROD) + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, + sse); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + unsigned int *sse) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, + sse); +} + +#endif // defined(__ARM_FEATURE_DOTPROD) + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) |