diff options
author | James Yu <james.yu@linaro.org> | 2014-01-21 17:23:27 +0800 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2014-12-09 20:02:46 -0800 |
commit | d12757f5c69a7c69bdf8035282348363334ab2f3 (patch) | |
tree | 72b1d8643d3ff06e3925f609c126a093b6a4023c /vp9/common | |
parent | 617382a2e3b4a8b053a8032a1a89be0dcf8e349c (diff) | |
download | libvpx-d12757f5c69a7c69bdf8035282348363334ab2f3.tar libvpx-d12757f5c69a7c69bdf8035282348363334ab2f3.tar.gz libvpx-d12757f5c69a7c69bdf8035282348363334ab2f3.tar.bz2 libvpx-d12757f5c69a7c69bdf8035282348363334ab2f3.zip |
VP9 common for ARMv8 by using NEON intrinsics 03
Add vp9_copy_neon.c
- vp9_convolve_copy_neon
Change-Id: I291fc5423d06240876411bbceab03eae5ef585be
Signed-off-by: James Yu <james.yu@linaro.org>
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/arm/neon/vp9_copy_neon.c | 92 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_copy_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_copy_neon.asm) | 0 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 3 |
3 files changed, 93 insertions, 2 deletions
```diff
diff --git a/vp9/common/arm/neon/vp9_copy_neon.c b/vp9/common/arm/neon/vp9_copy_neon.c
new file mode 100644
index 000000000..f334abe11
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_copy_neon.c
@@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_copy_neon(
+        const uint8_t *src,    // r0
+        ptrdiff_t src_stride,  // r1
+        uint8_t *dst,          // r2
+        ptrdiff_t dst_stride,  // r3
+        const int16_t *filter_x,
+        int filter_x_stride,
+        const int16_t *filter_y,
+        int filter_y_stride,
+        int w,
+        int h) {
+    uint8x8_t d0u8, d2u8;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    (void)filter_x; (void)filter_x_stride;
+    (void)filter_y; (void)filter_y_stride;
+
+    if (w > 32) {  // copy64
+        for (; h > 0; h--) {
+            q0u8 = vld1q_u8(src);
+            q1u8 = vld1q_u8(src + 16);
+            q2u8 = vld1q_u8(src + 32);
+            q3u8 = vld1q_u8(src + 48);
+            src += src_stride;
+
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q1u8);
+            vst1q_u8(dst + 32, q2u8);
+            vst1q_u8(dst + 48, q3u8);
+            dst += dst_stride;
+        }
+    } else if (w == 32) {  // copy32
+        for (; h > 0; h -= 2) {
+            q0u8 = vld1q_u8(src);
+            q1u8 = vld1q_u8(src + 16);
+            src += src_stride;
+            q2u8 = vld1q_u8(src);
+            q3u8 = vld1q_u8(src + 16);
+            src += src_stride;
+
+            vst1q_u8(dst, q0u8);
+            vst1q_u8(dst + 16, q1u8);
+            dst += dst_stride;
+            vst1q_u8(dst, q2u8);
+            vst1q_u8(dst + 16, q3u8);
+            dst += dst_stride;
+        }
+    } else if (w > 8) {  // copy16
+        for (; h > 0; h -= 2) {
+            q0u8 = vld1q_u8(src);
+            src += src_stride;
+            q1u8 = vld1q_u8(src);
+            src += src_stride;
+
+            vst1q_u8(dst, q0u8);
+            dst += dst_stride;
+            vst1q_u8(dst, q1u8);
+            dst += dst_stride;
+        }
+    } else if (w == 8) {  // copy8
+        for (; h > 0; h -= 2) {
+            d0u8 = vld1_u8(src);
+            src += src_stride;
+            d2u8 = vld1_u8(src);
+            src += src_stride;
+
+            vst1_u8(dst, d0u8);
+            dst += dst_stride;
+            vst1_u8(dst, d2u8);
+            dst += dst_stride;
+        }
+    } else {  // copy4
+        for (; h > 0; h--) {
+            *(uint32_t *)dst = *(const uint32_t *)src;
+            src += src_stride;
+            dst += dst_stride;
+        }
+    }
+    return;
+}
diff --git a/vp9/common/arm/neon/vp9_copy_neon.asm b/vp9/common/arm/neon/vp9_copy_neon_asm.asm
index a0bd04a35..a0bd04a35 100644
--- a/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/vp9/common/arm/neon/vp9_copy_neon_asm.asm
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8248b5cec..b9d9627bf 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -289,8 +289,7 @@ $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
 # Sub Pixel Filters
 #
 add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
+specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc";
 
 add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc";
```