summary | refs | log | tree | commit | diff
path: root/vpx_dsp
diff options
context:
space:
mode:
author James Zern <jzern@google.com> 2016-10-22 02:28:15 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2016-10-22 02:28:15 +0000
commit 5d91752a98e755c6d62ef5be1b106d77e5ed3c66 (patch)
tree 576dee862ddb60a90dd334b39cb2f6d509e4a9be /vpx_dsp
parent 9a032fa2628ee55ec557d2c03fb09ead796eea3d (diff)
parent 68cd3052cadad08fa85b7f02a4f303a4418e1d25 (diff)
download libvpx-5d91752a98e755c6d62ef5be1b106d77e5ed3c66.tar
libvpx-5d91752a98e755c6d62ef5be1b106d77e5ed3c66.tar.gz
libvpx-5d91752a98e755c6d62ef5be1b106d77e5ed3c66.tar.bz2
libvpx-5d91752a98e755c6d62ef5be1b106d77e5ed3c66.zip
Merge "vpx_highbd_convolve_copy_neon: use multi reg loads"
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c 57
1 file changed, 34 insertions, 23 deletions
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 68d57779b..a980ab1a3 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -49,44 +49,55 @@ void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
} while (h > 0);
} else if (w < 32) { // copy16
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
+ vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
+ vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
- h -= 2;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
} while (h > 0);
} else if (w == 32) { // copy32
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
+ vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
+ vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
- h -= 2;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
} while (h > 0);
} else { // copy64
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
- vst1q_u16(dst + 32, vld1q_u16(src + 32));
- vst1q_u16(dst + 40, vld1q_u16(src + 40));
- vst1q_u16(dst + 48, vld1q_u16(src + 48));
- vst1q_u16(dst + 56, vld1q_u16(src + 56));
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
- } while (--h);
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
}
}