diff options
author | James Zern <jzern@google.com> | 2016-10-15 11:04:37 -0700 |
---|---|---|
committer | James Zern <jzern@google.com> | 2016-10-17 17:15:03 -0700 |
commit | 68cd3052cadad08fa85b7f02a4f303a4418e1d25 (patch) | |
tree | 39dc4ab25a37b5f6bc7e8c686b898760baf320a6 /vpx_dsp | |
parent | 808a560be6a71dd0d27f29c26f9fefb5e809d373 (diff) | |
download | libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar.gz libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.tar.bz2 libvpx-68cd3052cadad08fa85b7f02a4f303a4418e1d25.zip |
vpx_highbd_convolve_copy_neon: use multi reg loads
for copy16/32/64
BUG=webm:1299
Change-Id: I5080d736bde7e487c80ef3d7024dda1e96a57eaf
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 57 |
1 files changed, 34 insertions, 23 deletions
```diff
diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index 68d57779b..a980ab1a3 100644
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -49,44 +49,55 @@ void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
     } while (h > 0);
   } else if (w < 32) { // copy16
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
+      vst2q_u16(dst, vld2q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
+      vst2q_u16(dst, vld2q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      h -= 2;
+      vst2q_u16(dst, vld2q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      vst2q_u16(dst, vld2q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
     } while (h > 0);
   } else if (w == 32) { // copy32
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
+      vst4q_u16(dst, vld4q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
+      vst4q_u16(dst, vld4q_u16(src));
       src += src_stride;
       dst += dst_stride;
-      h -= 2;
+      vst4q_u16(dst, vld4q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
     } while (h > 0);
   } else { // copy64
     do {
-      vst1q_u16(dst, vld1q_u16(src));
-      vst1q_u16(dst + 8, vld1q_u16(src + 8));
-      vst1q_u16(dst + 16, vld1q_u16(src + 16));
-      vst1q_u16(dst + 24, vld1q_u16(src + 24));
-      vst1q_u16(dst + 32, vld1q_u16(src + 32));
-      vst1q_u16(dst + 40, vld1q_u16(src + 40));
-      vst1q_u16(dst + 48, vld1q_u16(src + 48));
-      vst1q_u16(dst + 56, vld1q_u16(src + 56));
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
       src += src_stride;
       dst += dst_stride;
-    } while (--h);
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      vst4q_u16(dst, vld4q_u16(src));
+      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      src += src_stride;
+      dst += dst_stride;
+      h -= 4;
+    } while (h > 0);
   }
 }
```