summaryrefslogtreecommitdiff
path: root/vpx_dsp/arm/vpx_convolve8_neon.h
diff options
context:
space:
mode:
author: Linfeng Zhang <linfengz@google.com> 2017-09-05 14:48:17 -0700
committer: Linfeng Zhang <linfengz@google.com> 2017-09-06 15:55:17 -0700
commit: 3ec20445b28ceccb0a32727f81ef2659596aaf33 (patch)
tree: 323c9e26d16a462fd7eb838f43411aa93a00b7cb /vpx_dsp/arm/vpx_convolve8_neon.h
parent: d5d2cbcc758d8b735d2aba25663914185d11cada (diff)
download: libvpx-3ec20445b28ceccb0a32727f81ef2659596aaf33.tar
libvpx-3ec20445b28ceccb0a32727f81ef2659596aaf33.tar.gz
libvpx-3ec20445b28ceccb0a32727f81ef2659596aaf33.tar.bz2
libvpx-3ec20445b28ceccb0a32727f81ef2659596aaf33.zip
Refactor convolve8 NEON functions
Change-Id: I4ac576875c91fee7cb150d298fae4a2c156d374c
Diffstat (limited to 'vpx_dsp/arm/vpx_convolve8_neon.h')
-rw-r--r-- vpx_dsp/arm/vpx_convolve8_neon.h | 58
1 files changed, 58 insertions, 0 deletions
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
new file mode 100644
index 000000000..a086d481f
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+/*
+ * Loads eight rows of eight bytes each. Row k (k = 0..7) is read with
+ * vld1_u8() from s + k * p and stored through the matching output pointer,
+ * s0 for row 0 up to s7 for row 7. p is the signed distance in bytes
+ * between the starts of consecutive rows. The function does no bounds
+ * checking; each row address must be readable for 8 bytes.
+ */
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+  *s7 = vld1_u8(s + 7 * p);
+}
+
+/* 8-tap filter applied per 16-bit lane. filter3 and filter4 are presumably
+ * taps 3 and 4 of filters broadcast by the caller - TODO confirm. */
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filters,
+                                    const int16x8_t filter3,
+                                    const int16x8_t filter4) {
+  const int16x4_t taps_lo = vget_low_s16(filters); /* taps 0..3 (3 unused) */
+  const int16x4_t taps_hi = vget_high_s16(filters); /* taps 4..7 (4 unused) */
+  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 0); /* non-saturating mul/mla */
+  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
+  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
+  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
+  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
+  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);
+  acc = vqaddq_s16(acc, vmulq_s16(s3, filter3)); /* saturating add */
+  acc = vqaddq_s16(acc, vmulq_s16(s4, filter4)); /* saturating add */
+  return vqrshrun_n_s16(acc, 7); /* round, >> 7, saturate to uint8 */
+}