diff options
author | Frank Galligan <fgalligan@google.com> | 2015-01-15 14:36:41 -0800 |
---|---|---|
committer | Frank Galligan <fgalligan@google.com> | 2015-01-15 15:32:40 -0800 |
commit | 6e7e1cf32f85f91ddfcb49a807e598e8ead131fe (patch) | |
tree | 5bfb5a1cfae1fa515b35d04d117772052edcdbcc | |
parent | 1b7391a3a432bf2164b667f8187524494506ab92 (diff) | |
download | libvpx-6e7e1cf32f85f91ddfcb49a807e598e8ead131fe.tar libvpx-6e7e1cf32f85f91ddfcb49a807e598e8ead131fe.tar.gz libvpx-6e7e1cf32f85f91ddfcb49a807e598e8ead131fe.tar.bz2 libvpx-6e7e1cf32f85f91ddfcb49a807e598e8ead131fe.zip |
Add Neon intrinsics for vp9_avg_8x8_neon
On Nexus 7 speed -5, -6, -7, and -8 saw about a 1% increase
in perf for 480p. Speeds -5, -6, -7, and -8 saw about a 1.5%
increase in perf for 720p.
Tested on Nexus 7, built with ndk r10d, gcc 4.9.
Change-Id: Ibf17ebfd952a6aec941719bd8306df8ec4574bee
-rw-r--r-- | test/vp9_avg_test.cc | 10 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_avg_neon.c | 49 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 |
4 files changed, 61 insertions, 1 deletions
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index fa04528a2..252ed4efa 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -165,4 +165,14 @@ INSTANTIATE_TEST_CASE_P( #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, AverageTest, + ::testing::Values( + make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon))); + +#endif + } // namespace diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 11df21f07..4e9ec0f56 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1098,7 +1098,7 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; -specialize qw/vp9_avg_8x8 sse2/; +specialize qw/vp9_avg_8x8 sse2 neon/; add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p"; specialize qw/vp9_avg_4x4 sse2/; diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c new file mode 100644 index 000000000..f505fcb7a --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { + const uint32x4_t a = vpaddlq_u16(v_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { + uint8x8_t v_s0 = vld1_u8(s); + const uint8x8_t v_s1 = vld1_u8(s + p); + uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + + v_s0 = vld1_u8(s + 2 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 3 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 4 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 5 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 6 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 7 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + return (horizontal_add_u16x8(v_sum) + 32) >> 6; +} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c75fd8a01..33a1e6735 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -150,6 +150,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c |