diff options
author | Jonathan Wright <jonathan.wright@arm.com> | 2021-05-11 13:17:44 +0100 |
---|---|---|
committer | James Zern <jzern@google.com> | 2021-05-12 14:03:05 -0700 |
commit | c8b0432505d32820af0c42a94b219aa83eed5db9 (patch) | |
tree | 2b41224e7339d3c083dc101ddfb027af0997662c | |
parent | 2db85c269bc5479e48ea7cd4fde85236ee0bc347 (diff) | |
download | libvpx-c8b0432505d32820af0c42a94b219aa83eed5db9.tar libvpx-c8b0432505d32820af0c42a94b219aa83eed5db9.tar.gz libvpx-c8b0432505d32820af0c42a94b219aa83eed5db9.tar.bz2 libvpx-c8b0432505d32820af0c42a94b219aa83eed5db9.zip |
Implement Neon variance functions using UDOT instruction
Accelerate Neon variance functions by implementing the sum of squares
calculation using the Armv8.4-A UDOT instruction instead of 4 MLAs.
The previous implementation is retained for use on CPUs that do not
implement the Armv8.4-A dot product instructions.
Bug: b/181236880
Change-Id: I9ab3d52634278b9b6f0011f39390a1195210bc75
-rw-r--r-- | vpx_dsp/arm/sum_neon.h | 27 | ||||
-rw-r--r-- | vpx_dsp/arm/variance_neon.c | 96 |
2 files changed, 123 insertions, 0 deletions
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 630296237..9a7c424e8 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,33 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { +#if defined(__aarch64__) + return vaddv_s32(a); +#else + return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { +#if defined(__aarch64__) + return vaddv_u32(a); +#else + return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); +#endif +} + +static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { +#if defined(__aarch64__) + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { #if defined(__aarch64__) return vaddvq_u32(a); diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index e08f31f2c..19aaac7b6 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,6 +19,100 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" +#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) + +// Process a block of width 4 four rows at a time. +static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; i += 4) { + const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i, j; + uint32x4_t sum_a = vdupq_n_u32(0); + uint32x4_t sum_b = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const uint8x16_t a = vld1q_u8(src_ptr + j); + const uint8x16_t b = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(a, b); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); + sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + + *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { + int i = 0; + uint32x2_t sum_a = vdup_n_u32(0); + uint32x2_t sum_b = vdup_n_u32(0); + uint32x2_t sse_lo_u32 = vdup_n_u32(0); + uint32x2_t sse_hi_u32 = vdup_n_u32(0); + + do { + const uint8x8_t a_0 = vld1_u8(src_ptr); + const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0 = vld1_u8(ref_ptr); + const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); + + const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); + const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); + sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); + sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); + + sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); + sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); + sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); + + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; + i += 2; + } while (i < h); + + *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); + *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); +} + +#else + // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -160,6 +254,8 @@ static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); } +#endif + void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { |