summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
author: James Zern <jzern@google.com> 2016-10-15 11:04:37 -0700
committer: James Zern <jzern@google.com> 2016-10-17 17:15:03 -0700
commit: 68cd3052cadad08fa85b7f02a4f303a4418e1d25 (patch)
tree: 39dc4ab25a37b5f6bc7e8c686b898760baf320a6 /vpx_dsp
parent: 808a560be6a71dd0d27f29c26f9fefb5e809d373 (diff)
download: libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar
libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar.gz
libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar.bz2
libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.zip
vpx_highbd_convolve_copy_neon: use multi reg loads
for copy16/32/64
BUG=webm:1299
Change-Id: I5080d736bde7e487c80ef3d7024dda1e96a57eaf
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 57
1 file changed, 34 insertions, 23 deletions
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 68d57779b..a980ab1a3 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -49,44 +49,55 @@ void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
} while (h > 0);
} else if (w < 32) { // copy16
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
+ vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
+ vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
- h -= 2;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst2q_u16(dst, vld2q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
} while (h > 0);
} else if (w == 32) { // copy32
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
+ vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
+ vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
- h -= 2;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
} while (h > 0);
} else { // copy64
do {
- vst1q_u16(dst, vld1q_u16(src));
- vst1q_u16(dst + 8, vld1q_u16(src + 8));
- vst1q_u16(dst + 16, vld1q_u16(src + 16));
- vst1q_u16(dst + 24, vld1q_u16(src + 24));
- vst1q_u16(dst + 32, vld1q_u16(src + 32));
- vst1q_u16(dst + 40, vld1q_u16(src + 40));
- vst1q_u16(dst + 48, vld1q_u16(src + 48));
- vst1q_u16(dst + 56, vld1q_u16(src + 56));
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
- } while (--h);
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ vst4q_u16(dst, vld4q_u16(src));
+ vst4q_u16(dst + 32, vld4q_u16(src + 32));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 4;
+ } while (h > 0);
}
}