summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
authorJames Yu <james.yu@linaro.org>2013-12-17 22:03:09 +0800
committerGerrit Code Review <gerrit@gerrit.golo.chromium.org>2014-05-03 04:54:39 -0700
commitfeaf766bd04fc2d0a75d322f41785a1ecf076b30 (patch)
treea67a7b109cdcbe9847bd418579f2a47278d4196c /vp8
parent4a8336fa9da0e338f22012cb603be314c1414264 (diff)
downloadlibvpx-feaf766bd04fc2d0a75d322f41785a1ecf076b30.tar
libvpx-feaf766bd04fc2d0a75d322f41785a1ecf076b30.tar.gz
libvpx-feaf766bd04fc2d0a75d322f41785a1ecf076b30.tar.bz2
libvpx-feaf766bd04fc2d0a75d322f41785a1ecf076b30.zip
VP8 for ARMv8 by using NEON intrinsics 12
Add sad_neon.c - vp8_sad16x16_neon - vp8_sad16x8_neon - vp8_sad8x8_neon - vp8_sad8x16_neon - vp8_sad4x4_neon Change-Id: I08eaae49ec03fb91b394354660a5df0367cea311 Signed-off-by: James Yu <james.yu@linaro.org>
Diffstat (limited to 'vp8')
-rw-r--r--vp8/common/arm/neon/sad16_neon.asm212
-rw-r--r--vp8/common/arm/neon/sad8_neon.asm215
-rw-r--r--vp8/common/arm/neon/sad_neon.c184
-rw-r--r--vp8/vp8_common.mk3
4 files changed, 185 insertions, 429 deletions
diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
deleted file mode 100644
index 7197e5655..000000000
--- a/vp8/common/arm/neon/sad16_neon.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad16x16_neon|
- EXPORT |vp8_sad16x8_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src_ptr
-; r1 int src_stride
-; r2 unsigned char *ref_ptr
-; r3 int ref_stride
-|vp8_sad16x16_neon| PROC
-;;
- vpush {d8-d15}
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
-;;
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0]
- vld1.8 {q7}, [r2]
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- vpop {d8-d15}
- bx lr
-
- ENDP
-
-;==============================
-;unsigned int vp8_sad16x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-|vp8_sad16x8_neon| PROC
- vpush {d8-d15}
-
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabdl.u8 q12, d0, d8
- vabdl.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vld1.8 {q0}, [r0], r1
- vld1.8 {q4}, [r2], r3
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vld1.8 {q1}, [r0], r1
- vld1.8 {q5}, [r2], r3
-
- vabal.u8 q12, d0, d8
- vabal.u8 q13, d1, d9
-
- vld1.8 {q2}, [r0], r1
- vld1.8 {q6}, [r2], r3
-
- vabal.u8 q12, d2, d10
- vabal.u8 q13, d3, d11
-
- vld1.8 {q3}, [r0], r1
- vld1.8 {q7}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q13, d5, d13
-
- vabal.u8 q12, d6, d14
- vabal.u8 q13, d7, d15
-
- vadd.u16 q0, q12, q13
-
- vpaddl.u16 q1, q0
- vpaddl.u32 q0, q1
-
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- vpop {d8-d15}
- bx lr
-
- ENDP
-
- END
diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
deleted file mode 100644
index 6b849d933..000000000
--- a/vp8/common/arm/neon/sad8_neon.asm
+++ /dev/null
@@ -1,215 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sad8x8_neon|
- EXPORT |vp8_sad8x16_neon|
- EXPORT |vp8_sad4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; unsigned int vp8_sad8x8_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x8_neon| PROC
- vpush {d8-d15}
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- vpop {d8-d15}
- bx lr
-
- ENDP
-
-;============================
-;unsigned int vp8_sad8x16_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad8x16_neon| PROC
- vpush {d8-d15}
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vabal.u8 q12, d6, d14
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabal.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 q1, q12
- vpaddl.u32 q0, q1
- vadd.u32 d0, d0, d1
-
- vmov.32 r0, d0[0]
-
- vpop {d8-d15}
- bx lr
-
- ENDP
-
-;===========================
-;unsigned int vp8_sad4x4_c(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-
-|vp8_sad4x4_neon| PROC
- vpush {d8-d15}
- vld1.8 {d0}, [r0], r1
- vld1.8 {d8}, [r2], r3
-
- vld1.8 {d2}, [r0], r1
- vld1.8 {d10}, [r2], r3
-
- vabdl.u8 q12, d0, d8
-
- vld1.8 {d4}, [r0], r1
- vld1.8 {d12}, [r2], r3
-
- vabal.u8 q12, d2, d10
-
- vld1.8 {d6}, [r0], r1
- vld1.8 {d14}, [r2], r3
-
- vabal.u8 q12, d4, d12
- vabal.u8 q12, d6, d14
-
- vpaddl.u16 d1, d24
- vpaddl.u32 d0, d1
- vmov.32 r0, d0[0]
-
- vpop {d8-d15}
- bx lr
-
- ENDP
-
- END
diff --git a/vp8/common/arm/neon/sad_neon.c b/vp8/common/arm/neon/sad_neon.c
new file mode 100644
index 000000000..6595ac051
--- /dev/null
+++ b/vp8/common/arm/neon/sad_neon.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+unsigned int vp8_sad8x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 7; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad8x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad4x4_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vp8_sad16x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 15; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vp8_sad16x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 1b658352b..964e378e0 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -160,8 +160,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
# common (neon)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
@@ -186,6 +184,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))