diff options
author | Johann <johannkoenig@google.com> | 2016-04-14 14:26:32 -0700 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2016-04-21 21:40:25 -0700 |
commit | 2f5840de3ec53cef99b30bd5eb1877f92f15a80f (patch) | |
tree | 39ce9b141f7fd5ec315ba7e35adf202edd925859 /vpx_dsp/arm | |
parent | 1710419eb5a41d4a16fdf48ca47173832a585482 (diff) | |
download | libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar.gz libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar.bz2 libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.zip |
vpx_minmax_8x8_neon and test
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1156
Change-Id: Ief0ad8d6255b0ef0f233cda153799e3c72d3dbc6
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r-- | vpx_dsp/arm/avg_neon.c | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index d054c4185..e52958c54 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -197,3 +197,60 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { return s - ((t * t) >> shift_factor); } } + +void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int *min, int *max) { + // Load and concatenate. + const uint8x16_t a01 = vcombine_u8(vld1_u8(a), + vld1_u8(a + a_stride)); + const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride), + vld1_u8(a + 3 * a_stride)); + const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride), + vld1_u8(a + 5 * a_stride)); + const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride), + vld1_u8(a + 7 * a_stride)); + + const uint8x16_t b01 = vcombine_u8(vld1_u8(b), + vld1_u8(b + b_stride)); + const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride), + vld1_u8(b + 3 * b_stride)); + const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride), + vld1_u8(b + 5 * b_stride)); + const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride), + vld1_u8(b + 7 * b_stride)); + + // Absolute difference. + const uint8x16_t ab01_diff = vabdq_u8(a01, b01); + const uint8x16_t ab23_diff = vabdq_u8(a23, b23); + const uint8x16_t ab45_diff = vabdq_u8(a45, b45); + const uint8x16_t ab67_diff = vabdq_u8(a67, b67); + + // Max values between the Q vectors. + const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); + const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); + + const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); + const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); + + // Split to D and start doing pairwise. + uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); + uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); + + // Enough runs of vpmax/min propogate the max/min values to every position. + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u8((uint8_t *)max, ab_max, 0); + vst1_lane_u8((uint8_t *)min, ab_min, 0); +} |