add vp9_satd_neon

~60-65% faster at the function level across block sizes Change-Id: Iaf8cbe95731c43fdcbf68256e44284ba51a93893
author: James Zern <jzern@google.com> 2015-11-19 23:39:10 -0800
committer: James Zern <jzern@google.com> 2015-11-24 16:09:10 -0800
commit: eb1d0f8d60baa7e7255fa2350cd2537457f15348 (patch)
tree: 886955c76af2f3db2994a075b49c22d42853b0b2 /vp9
parent: 60760f710fd226f687fffedd6e4ab4d904ef0b48 (diff)
download: libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar
libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar.gz
libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar.bz2
libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.zip
2 files changed, 28 insertions, 1 deletions
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 7a2883aba..8fe6503aa 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -210,7 +210,7 @@ add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride,
 specialize qw/vp9_hadamard_16x16 sse2/;
 
 add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
+specialize qw/vp9_satd sse2 neon/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
 specialize qw/vp9_int_pro_row sse2 neon/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index d569ec95d..5996bd426 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
 
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vp9_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
 void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
author	James Zern <jzern@google.com>	2015-11-19 23:39:10 -0800
committer	James Zern <jzern@google.com>	2015-11-24 16:09:10 -0800
commit	eb1d0f8d60baa7e7255fa2350cd2537457f15348 (patch)
tree	886955c76af2f3db2994a075b49c22d42853b0b2 /vp9
parent	60760f710fd226f687fffedd6e4ab4d904ef0b48 (diff)
download	libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar.gz libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.tar.bz2 libvpx-eb1d0f8d60baa7e7255fa2350cd2537457f15348.zip