summaryrefslogtreecommitdiff
path: root/vpx_dsp/arm
diff options
context:
space:
mode:
authorJohann <johannkoenig@google.com>2016-04-14 14:26:32 -0700
committerJohann <johannkoenig@google.com>2016-04-21 21:40:25 -0700
commit2f5840de3ec53cef99b30bd5eb1877f92f15a80f (patch)
tree39ce9b141f7fd5ec315ba7e35adf202edd925859 /vpx_dsp/arm
parent1710419eb5a41d4a16fdf48ca47173832a585482 (diff)
downloadlibvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar
libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar.gz
libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.tar.bz2
libvpx-2f5840de3ec53cef99b30bd5eb1877f92f15a80f.zip
vpx_minmax_8x8_neon and test
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1156 Change-Id: Ief0ad8d6255b0ef0f233cda153799e3c72d3dbc6
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r--vpx_dsp/arm/avg_neon.c57
1 files changed, 57 insertions, 0 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index d054c4185..e52958c54 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -197,3 +197,60 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
return s - ((t * t) >> shift_factor);
}
}
+/* Computes the smallest and largest absolute difference between the
+ * corresponding bytes of two 8x8 blocks.
+ *
+ * a, b:               each points at the first of eight rows; 8 bytes are
+ *                     read from every row.
+ * a_stride, b_stride: offset, in bytes, from the start of one row to the
+ *                     start of the next.
+ * min, max:           receive the smallest and largest |a - b| found over
+ *                     all 64 positions.
+ *
+ * NOTE(review): the result is written into the lowest-addressed byte of
+ * *min / *max after both have been zeroed, so it is the int's low-order
+ * byte only on a little-endian target -- confirm that is the only
+ * configuration this file is built for.
+ */
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int *min, int *max) {
+ // Load eight 8-byte rows from each block, packing two rows per Q vector.
+ const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
+ vld1_u8(a + a_stride));
+ const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
+ vld1_u8(a + 3 * a_stride));
+ const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
+ vld1_u8(a + 5 * a_stride));
+ const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
+ vld1_u8(a + 7 * a_stride));
+
+ const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
+ vld1_u8(b + b_stride));
+ const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
+ vld1_u8(b + 3 * b_stride));
+ const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
+ vld1_u8(b + 5 * b_stride));
+ const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
+ vld1_u8(b + 7 * b_stride));
+
+ // Per-byte absolute difference |a - b| for each pair of rows.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Lane-wise max and min values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+ // Fold the high and low D halves together, then start reducing pairwise.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear the bytes the one-byte lane stores below leave untouched.
+ // Store lane 0 directly to memory to avoid a costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}