author     H.J. Lu <hjl.tools@gmail.com>   2016-03-31 10:04:26 -0700
committer  H.J. Lu <hjl.tools@gmail.com>   2016-04-02 11:28:34 -0700
commit     1c5d0b0ba41376c2f0792da4f22cc1f5b2b2688e (patch)
tree       cc10b9f71133fadcd170d9a704521c46504230a9
parent     e1203f48239fbb9832db6ed3a0d2a008e317aff9 (diff)
Add x86-64 memmove with unaligned load/store and rep movsb
Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes.  When size
is <= 8 times the vector register size, there is no check for address
overlap between source and destination.  Since the overhead of the
overlap check is small when size is > 8 times the vector register
size, memcpy is an alias of memmove.

A single file provides 2 implementations of memmove, one with rep
movsb and the other without.  They share the same code when size is
between 2 times the vector register size and REP_MOVSB_THRESHOLD,
which is 2KB for the 16-byte vector register size and scaled up for
larger vector register sizes.

Key features:

1. Use overlapping load and store to avoid branch.
2. For size <= 8 times the vector register size, load all sources into
   registers and store them together.
3. If there is no address overlap between source and destination, copy
   from both ends with 4 times the vector register size at a time.
4. If the address of the destination > the address of the source,
   backward copy 8 times the vector register size at a time.
5. Otherwise, forward copy 8 times the vector register size at a time.
6. Use rep movsb only for forward copy.  Avoid slow backward rep movsb
   by falling back to backward copy 8 times the vector register size
   at a time.
7. Skip when the address of the destination == the address of the
   source.

        [BZ #19776]
        * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
        memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
        memmove-avx512-unaligned-erms.
        * sysdeps/x86_64/multiarch/ifunc-impl-list.c
        (__libc_ifunc_impl_list): Test __memmove_chk_avx512_unaligned_2,
        __memmove_chk_avx512_unaligned_erms,
        __memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
        __memmove_chk_sse2_unaligned_2,
        __memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
        __memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
        __memmove_avx512_unaligned_erms, __memmove_erms,
        __memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
        __memcpy_chk_avx512_unaligned_2,
        __memcpy_chk_avx512_unaligned_erms,
        __memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
        __memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
        __memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
        __memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
        __memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
        __memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
        __mempcpy_chk_avx512_unaligned_erms,
        __mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
        __mempcpy_chk_sse2_unaligned_2,
        __mempcpy_chk_sse2_unaligned_erms, __mempcpy_avx512_unaligned_2,
        __mempcpy_avx512_unaligned_erms, __mempcpy_avx_unaligned_2,
        __mempcpy_avx_unaligned_erms, __mempcpy_sse2_unaligned_2,
        __mempcpy_sse2_unaligned_erms and __mempcpy_erms.
        * sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
        file.
        * sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
        Likewise.
        * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
        Likewise.
        * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
        Likewise.

(cherry picked from commit 88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb)
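For readers unfamiliar with the trick in item 1 of the key features
above, the following C sketch (not part of the patch; copy_vec_to_2x_vec
and vec_t are illustrative stand-ins for the assembly's vector loads and
stores) shows how a copy of VEC_SIZE to 2 * VEC_SIZE bytes is done with
two possibly overlapping loads and two possibly overlapping stores, so
no per-size branching is needed and memmove semantics hold because both
loads happen before either store:

    #include <stddef.h>
    #include <string.h>

    #define VEC_SIZE 32   /* 16, 32 or 64 depending on the variant.  */

    typedef struct { unsigned char b[VEC_SIZE]; } vec_t;

    /* Assumes VEC_SIZE <= n && n <= 2 * VEC_SIZE.  */
    static void *
    copy_vec_to_2x_vec (void *dst, const void *src, size_t n)
    {
      vec_t head, tail;
      memcpy (&head, src, VEC_SIZE);                 /* VMOVU (%rsi), %VEC(0) */
      memcpy (&tail, (const char *) src + n - VEC_SIZE, VEC_SIZE);
      memcpy (dst, &head, VEC_SIZE);                 /* VMOVU %VEC(0), (%rdi) */
      memcpy ((char *) dst + n - VEC_SIZE, &tail, VEC_SIZE);
      return dst;
    }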
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                          |   5
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c                 | 111
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S      |   9
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S   |  11
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S     |   9
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S      | 462
6 files changed, 606 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d6b022ee92..1f8d2a370e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -19,7 +19,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
- memset-avx512-no-vzeroupper
+ memset-avx512-no-vzeroupper \
+ memmove-sse2-unaligned-erms \
+ memmove-avx-unaligned-erms \
+ memmove-avx512-unaligned-erms
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 98c1582f85..bcf4788b4c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -48,19 +48,37 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
IFUNC_IMPL (i, name, __memmove_chk,
+#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_unaligned_erms)
+#endif
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memmove.S. */
@@ -69,12 +87,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_avx_unaligned_erms)
+#ifdef HAVE_AVX512_ASM_SUPPORT
+ IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_unaligned_erms)
+#endif
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
#ifdef HAVE_AVX2_SUPPORT
@@ -264,19 +301,37 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_unaligned_erms)
+#endif
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
@@ -284,45 +339,101 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3)
+#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_unaligned_erms)
+#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_unaligned_erms)
+#endif
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_unaligned_erms)
+#endif
+ IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
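Each IFUNC_IMPL_ADD entry above registers one implementation of the
given function, together with the feature check it needs, so glibc's
ifunc test machinery can enumerate and exercise every variant; the
actual runtime dispatch lives in the selector files named in the
"Support sysdeps/x86_64/multiarch/..." comments, which this patch does
not modify.  The hypothetical helper below (not glibc code; the has_*
flags stand in for the real feature checks) only illustrates the
widest-vector-first preference implied by the guards above:

    /* Illustrative only: returns the symbol name a widest-vector-first
       policy would pick for memmove.  The real selectors consider
       additional feature bits and tunables.  */
    static const char *
    pick_memmove (int has_avx512f, int has_avx, int has_ssse3)
    {
      if (has_avx512f)
        return "__memmove_avx512_unaligned_erms";
      if (has_avx)
        return "__memmove_avx_unaligned_erms";
      if (has_ssse3)
        return "__memmove_ssse3_back";
      return "__memmove_sse2_unaligned_erms";
    }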
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644
index 0000000000..3a72c7eafd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define SECTION(p) p##.avx
+#define MEMMOVE_SYMBOL(p,s) p##_avx_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644
index 0000000000..38358fa37c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,11 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE 64
+# define VEC(i) zmm##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# define SECTION(p) p##.avx512
+# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
new file mode 100644
index 0000000000..52b9ae08fc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define SECTION(p) p
+#define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000000..cf645dd7ff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,462 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memmove/memcpy/mempcpy is implemented as:
+ 1. Use overlapping load and store to avoid branch.
+ 2. Use 8-bit or 32-bit displacements for branches and nop paddings
+ to avoid long nop between instructions.
+ 3. Load all sources into registers and store them together to avoid
+ possible address overlap between source and destination.
+ 4. If size is 2 * VEC_SIZE or less, load all sources into registers
+ and store them together.
+ 5. If there is no address overlap, copy from both ends with
+ 4 * VEC_SIZE at a time.
+ 6. If size is 8 * VEC_SIZE or less, load all sources into registers
+ and store them together.
+ 7. If address of destination > address of source, backward copy
+ 8 * VEC_SIZE at a time.
+ 8. Otherwise, forward copy 8 * VEC_SIZE at a time. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
+# endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
+ up REP MOVSB operation, REP MOVSB isn't faster on short data. The
+ memcpy micro benchmark in glibc shows that 2KB is the approximate
+ value above which REP MOVSB becomes faster than SSE2 optimization
+ on processors with Enhanced REP MOVSB. Since larger register size
+ can move more data with a single load and store, the threshold is
+ higher with larger register size. */
+# ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+# endif
+
+# ifndef SECTION
+# error SECTION is not defined!
+# endif
+ .section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+ movq %rdi, %rax
+L(start):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VZEROUPPER
+ ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB. */
+# ifdef SHARED
+ENTRY (__mempcpy_erms)
+ movq %rdi, %rax
+ addq %rdx, %rax
+ jmp L(movsb)
+END (__mempcpy_erms)
+# endif
+
+ENTRY (__memmove_erms)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ cmpq %rsi, %rdi
+ jbe 1f
+ leaq (%rsi,%rcx), %rdx
+ cmpq %rdx, %rdi
+ jb L(movsb_backward)
+1:
+ rep movsb
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ movq %rdi, %rax
+L(start_erms):
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(movsb_more_2x_vec)
+L(last_2x_vec):
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+ VZEROUPPER
+ ret
+
+L(movsb):
+ cmpq %rsi, %rdi
+ je L(nop)
+ jb 1f
+ leaq (%rsi,%rdx), %r9
+ cmpq %r9, %rdi
+ /* Avoid slow backward REP MOVSB. */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+ jb L(more_8x_vec_backward)
+1:
+ movq %rdx, %rcx
+ rep movsb
+L(nop):
+ ret
+
+ .p2align 4
+L(movsb_more_2x_vec):
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ ja.d32 L(movsb)
+ .p2align 4
+L(more_2x_vec):
+ /* More than 2 * VEC. */
+ cmpq %rsi, %rdi
+ je L(nop)
+ jb L(copy_forward)
+ leaq (%rsi,%rdx), %rcx
+ cmpq %rcx, %rdi
+ jb L(more_2x_vec_overlap)
+L(copy_forward):
+ leaq (%rdi,%rdx), %rcx
+ cmpq %rcx, %rsi
+ jb L(more_2x_vec_overlap)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 4), %rdx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ jbe.d32 L(return)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 8), %rdx
+# if VEC_SIZE == 16
+ jbe L(return)
+# else
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ jbe L(return_disp8)
+# endif
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rcx
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
+ cmpq %rdx, %rcx
+ /* Use 8-bit displacement to avoid long nop between
+ instructions. */
+ je L(return_disp8)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq VEC_SIZE(%r10), %r9
+ leaq (VEC_SIZE * 2)(%r10), %r8
+ leaq (VEC_SIZE * 3)(%r10), %r11
+ .p2align 4
+L(loop):
+ VMOVU (%rcx,%r10), %VEC(0)
+ VMOVU (%rcx,%r9), %VEC(1)
+ VMOVU (%rcx,%r8), %VEC(2)
+ VMOVU (%rcx,%r11), %VEC(3)
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(1), VEC_SIZE(%rcx)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+L(return_disp8):
+ VZEROUPPER
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ jae L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ jae L(between_16_31)
+# endif
+ cmpb $8, %dl
+ jae L(between_8_15)
+ cmpb $4, %dl
+ jae L(between_4_7)
+ cmpb $1, %dl
+ ja L(between_2_3)
+ jb 1f
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+1:
+ ret
+# if VEC_SIZE > 32
+L(between_32_63):
+ /* From 32 to 63. No branch when size == 32. */
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ VZEROUPPER
+ ret
+# endif
+# if VEC_SIZE > 16
+ /* From 16 to 31. No branch when size == 16. */
+L(between_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi,%rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi,%rdx)
+ ret
+# endif
+L(between_8_15):
+ /* From 8 to 15. No branch when size == 8. */
+ movq -8(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rsi, (%rdi)
+ ret
+L(between_4_7):
+ /* From 4 to 7. No branch when size == 4. */
+ movl -4(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi,%rdx)
+ movl %esi, (%rdi)
+ ret
+L(between_2_3):
+ /* From 2 to 3. No branch when size == 2. */
+ movzwl -2(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %cx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ ret
+
+# if VEC_SIZE > 16
+ /* Align to 16 bytes to avoid long nop between instructions. */
+ .p2align 4
+# endif
+L(more_2x_vec_overlap):
+ /* More than 2 * VEC and there is overlap between destination
+ and source. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+ /* Copy from 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(last_4x_vec):
+ /* Copy from 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(between_0_and_4x_vec):
+ /* Copy from 0 to 4 * VEC. */
+ cmpl $(VEC_SIZE * 2), %edx
+ jae L(last_4x_vec)
+ /* Copy from 0 to 2 * VEC. */
+ cmpl $VEC_SIZE, %edx
+ jae L(last_2x_vec)
+ /* Copy from 0 to VEC. */
+ VZEROUPPER
+ jmp L(less_vec)
+L(more_8x_vec):
+ cmpq %rsi, %rdi
+ ja L(more_8x_vec_backward)
+
+ .p2align 4
+L(loop_8x_vec_forward):
+ /* Copy 8 * VEC a time forward. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
+ VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
+ VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
+ VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
+ addq $(VEC_SIZE * 8), %rdi
+ addq $(VEC_SIZE * 8), %rsi
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_forward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+
+ .p2align 4
+L(more_8x_vec_backward):
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ leaq -VEC_SIZE(%rdi, %rdx), %r9
+
+ .p2align 4
+L(loop_8x_vec_backward):
+ /* Copy 8 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
+ VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
+ VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
+ VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
+ VMOVU %VEC(0), (%r9)
+ VMOVU %VEC(1), -VEC_SIZE(%r9)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
+ VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
+ VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
+ VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
+ VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
+ subq $(VEC_SIZE * 8), %rcx
+ subq $(VEC_SIZE * 8), %r9
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_backward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+ MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+ MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+ MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+ MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif
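Taken together, the template above dispatches purely on size and
overlap.  With VEC_SIZE of 16, 32 and 64 bytes, REP_MOVSB_THRESHOLD
works out to 2048, 4096 and 8192 bytes respectively.  The following
self-contained C sketch (illustrative only; classify_copy and the enum
are not part of glibc) summarizes which path the unaligned_erms entry
point takes:

    #include <stddef.h>
    #include <stdint.h>

    #define VEC_SIZE            32                    /* 16, 32 or 64.  */
    #define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))

    enum copy_path
    {
      PATH_NOP,           /* dst == src: L(nop), nothing to copy.  */
      PATH_HEAD_TAIL,     /* size <= 2 * VEC_SIZE: overlapping head/tail
                             stores (smaller granularities below VEC_SIZE).  */
      PATH_VEC_COPY,      /* size <= REP_MOVSB_THRESHOLD: 4x/8x VEC_SIZE
                             vector copies.  */
      PATH_REP_MOVSB,     /* large forward copy: rep movsb.  */
      PATH_VEC_BACKWARD   /* large overlapping backward copy: 8x VEC_SIZE
                             loop, avoiding slow backward rep movsb.  */
    };

    static enum copy_path
    classify_copy (const void *dst, const void *src, size_t n)
    {
      uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;

      if (n <= 2 * (size_t) VEC_SIZE)
        return PATH_HEAD_TAIL;
      if (d == s)
        return PATH_NOP;
      if (n <= REP_MOVSB_THRESHOLD)
        return PATH_VEC_COPY;
      /* Forward rep movsb would clobber unread source bytes when the
         destination overlaps the source from above, and backward
         rep movsb is slow, so use the backward vector loop instead.  */
      if (d > s && d < s + n)
        return PATH_VEC_BACKWARD;
      return PATH_REP_MOVSB;
    }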