Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile | 42
-rw-r--r--  sysdeps/x86_64/multiarch/bcopy.S | 7
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 460
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-wmemset.h | 42
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 425
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S | 1776
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1990
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp.S | 78
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3180
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3150
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy.S | 75
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy_chk.S | 72
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S | 12
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S | 420
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S | 12
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 553
-rw-r--r--  sysdeps/x86_64/multiarch/memmove.S | 101
-rw-r--r--  sysdeps/x86_64/multiarch/memmove_chk.S | 71
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy.S | 73
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy_chk.S | 72
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S | 22
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S | 194
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S | 24
-rw-r--r--  sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 263
-rw-r--r--  sysdeps/x86_64/multiarch/memset.S | 82
-rw-r--r--  sysdeps/x86_64/multiarch/memset_chk.S | 61
-rw-r--r--  sysdeps/x86_64/multiarch/sched_cpucount.c | 36
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy.S | 9
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-c.c | 8
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy.S | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/strcasecmp_l.S | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 279
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-ssse3.S | 867
-rw-r--r--  sysdeps/x86_64/multiarch/strcat.S | 85
-rw-r--r--  sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S | 280
-rw-r--r--  sysdeps/x86_64/multiarch/strchr.S | 57
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 213
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse42.S | 1792
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp.S | 209
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 1889
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3551
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.S | 99
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn-c.c | 173
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn.S | 69
-rw-r--r--  sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/strncase_l.S | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-c.c | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-ssse3.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/strncat.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/strncmp-ssse3.S | 6
-rw-r--r--  sysdeps/x86_64/multiarch/strncmp.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-c.c | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk-c.c | 8
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/strspn-c.c | 145
-rw-r--r--  sysdeps/x86_64/multiarch/strspn.S | 50
-rw-r--r--  sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S | 374
-rw-r--r--  sysdeps/x86_64/multiarch/strstr.c | 50
-rw-r--r--  sysdeps/x86_64/multiarch/test-multiarch.c | 96
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.c | 25
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.h | 30
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-c.c | 5
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 552
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy.S | 40
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-c.c | 9
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 5
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen.c | 45
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-c.c | 9
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp.S | 55
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset.c | 33
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S | 21
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset_chk.c | 31
87 files changed, 0 insertions, 24585 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
deleted file mode 100644
index 310a3a4b72..0000000000
--- a/sysdeps/x86_64/multiarch/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-ifeq ($(subdir),csu)
-tests += test-multiarch
-endif
-
-ifeq ($(subdir),string)
-
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
- strcmp-sse2-unaligned strncmp-ssse3 \
- memcmp-avx2-movbe \
- memcmp-sse4 memcpy-ssse3 \
- memmove-ssse3 \
- memcpy-ssse3-back \
- memmove-ssse3-back \
- memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
- strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
- strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
- strcspn-c strpbrk-c strspn-c varshift \
- memset-avx512-no-vzeroupper \
- memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms \
- memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms
-CFLAGS-varshift.c += -msse4
-CFLAGS-strcspn-c.c += -msse4
-CFLAGS-strpbrk-c.c += -msse4
-CFLAGS-strspn-c.c += -msse4
-endif
-
-ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
- wmemcmp-avx2-movbe \
- wcscpy-ssse3 wcscpy-c \
- wcsnlen-sse4_1 wcsnlen-c
-endif
-
-ifeq ($(subdir),debug)
-sysdep_routines += wmemset_chk-nonshared
-endif
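
Note on the per-file CFLAGS in the Makefile above: the C variants of strcspn/strspn/strpbrk and the varshift helper are built with -msse4 because they rely on the SSE4.2 string-comparison intrinsics, which the compiler only accepts when SSE4.2 code generation is enabled. A minimal, hypothetical example of the kind of intrinsic involved (function and variable names are illustrative, not taken from the removed files; both pointers are assumed to point at 16 readable bytes):

#include <nmmintrin.h>          /* SSE4.2 string intrinsics, needs -msse4 */

/* Index of the first byte of SEGMENT16 that occurs anywhere in SET16,
   or 16 if no byte matches -- the core operation behind the SSE4.2
   strcspn/strpbrk variants.  */
static int
first_match_index (const char *segment16, const char *set16)
{
  __m128i set = _mm_loadu_si128 ((const __m128i *) set16);
  __m128i seg = _mm_loadu_si128 ((const __m128i *) segment16);
  return _mm_cmpistri (set, seg, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
}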
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
deleted file mode 100644
index 639f02bde3..0000000000
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <sysdep.h>
-
- .text
-ENTRY(bcopy)
- xchg %rdi, %rsi
- jmp __libc_memmove /* Branch to IFUNC memmove. */
-END(bcopy)
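
The stub above is the entire multiarch bcopy: it exchanges the two pointer arguments and tail-jumps into the memmove IFUNC, so bcopy automatically uses whichever memmove variant the selector picked. A plain-C sketch of the same behaviour (my_bcopy is a stand-in name, not the glibc symbol):

#include <stddef.h>
#include <string.h>

/* What the xchg %rdi,%rsi + jmp above does at the C level: bcopy takes
   (src, dst, n) while memmove takes (dst, src, n), so the two pointers
   are swapped and the length is passed through unchanged.  */
static void
my_bcopy (const void *src, void *dst, size_t n)
{
  memmove (dst, src, n);
}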
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
deleted file mode 100644
index 5627183aca..0000000000
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/* Enumerate available IFUNC implementations of a function. x86-64 version.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <assert.h>
-#include <string.h>
-#include <wchar.h>
-#include <ifunc-impl-list.h>
-#include <sysdep.h>
-#include "init-arch.h"
-
-/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 5
-
-/* Fill ARRAY of MAX elements with IFUNC implementations for function
- NAME supported on target machine and return the number of valid
- entries. */
-
-size_t
-__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- size_t max)
-{
- assert (max >= MAX_IFUNC);
-
- size_t i = 0;
-
- /* Support sysdeps/x86_64/multiarch/memcmp.S. */
- IFUNC_IMPL (i, name, memcmp,
- IFUNC_IMPL_ADD (array, i, memcmp,
- (HAS_ARCH_FEATURE (AVX2_Usable)
- && HAS_CPU_FEATURE (MOVBE)),
- __memcmp_avx2_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
- __memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
- __memcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
- IFUNC_IMPL (i, name, __memmove_chk,
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memmove_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memmove.S. */
- IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
- __memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
- __memmove_ssse3)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
- __memmove_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
- __memmove_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memset_chk.S. */
- IFUNC_IMPL (i, name, __memset_chk,
- IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
- __memset_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
- __memset_chk_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_chk_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_chk_avx2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_no_vzeroupper)
- )
-
- /* Support sysdeps/x86_64/multiarch/memset.S. */
- IFUNC_IMPL (i, name, memset,
- IFUNC_IMPL_ADD (array, i, memset, 1,
- __memset_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memset, 1,
- __memset_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_avx2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_no_vzeroupper)
- )
-
- /* Support sysdeps/x86_64/multiarch/stpncpy.S. */
- IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
- __stpncpy_ssse3)
- IFUNC_IMPL_ADD (array, i, stpncpy, 1,
- __stpncpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/stpcpy.S. */
- IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
- __stpcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
- IFUNC_IMPL (i, name, strcasecmp,
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strcasecmp_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_CPU_FEATURE (SSE4_2),
- __strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_CPU_FEATURE (SSSE3),
- __strcasecmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
- IFUNC_IMPL (i, name, strcasecmp_l,
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strcasecmp_l_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_CPU_FEATURE (SSE4_2),
- __strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_CPU_FEATURE (SSSE3),
- __strcasecmp_l_ssse3)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
- __strcasecmp_l_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcat.S. */
- IFUNC_IMPL (i, name, strcat,
- IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
- __strcat_ssse3)
- IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strchr.S. */
- IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcmp.S. */
- IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
- __strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
- __strcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcpy.S. */
- IFUNC_IMPL (i, name, strcpy,
- IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
- __strcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcspn.S. */
- IFUNC_IMPL (i, name, strcspn,
- IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
- __strcspn_sse42)
- IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
- IFUNC_IMPL (i, name, strncasecmp,
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strncasecmp_avx)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_CPU_FEATURE (SSE4_2),
- __strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_CPU_FEATURE (SSSE3),
- __strncasecmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
- __strncasecmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
- IFUNC_IMPL (i, name, strncasecmp_l,
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strncasecmp_l_avx)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_CPU_FEATURE (SSE4_2),
- __strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_CPU_FEATURE (SSSE3),
- __strncasecmp_l_ssse3)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
- __strncasecmp_l_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncat.S. */
- IFUNC_IMPL (i, name, strncat,
- IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
- __strncat_ssse3)
- IFUNC_IMPL_ADD (array, i, strncat, 1,
- __strncat_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncpy.S. */
- IFUNC_IMPL (i, name, strncpy,
- IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
- __strncpy_ssse3)
- IFUNC_IMPL_ADD (array, i, strncpy, 1,
- __strncpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strpbrk.S. */
- IFUNC_IMPL (i, name, strpbrk,
- IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
- __strpbrk_sse42)
- IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
-
-
- /* Support sysdeps/x86_64/multiarch/strspn.S. */
- IFUNC_IMPL (i, name, strspn,
- IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
- __strspn_sse42)
- IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strstr.c. */
- IFUNC_IMPL (i, name, strstr,
- IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wcscpy.S. */
- IFUNC_IMPL (i, name, wcscpy,
- IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
- __wcscpy_ssse3)
- IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
- IFUNC_IMPL (i, name, wcsnlen,
- IFUNC_IMPL_ADD (array, i, wcsnlen,
- HAS_CPU_FEATURE (SSE4_1),
- __wcsnlen_sse4_1)
- IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
- IFUNC_IMPL (i, name, wmemcmp,
- IFUNC_IMPL_ADD (array, i, wmemcmp,
- (HAS_ARCH_FEATURE (AVX2_Usable)
- && HAS_CPU_FEATURE (MOVBE)),
- __wmemcmp_avx2_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
- __wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
- __wmemcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemset.c. */
- IFUNC_IMPL (i, name, wmemset,
- IFUNC_IMPL_ADD (array, i, wmemset, 1,
- __wmemset_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, wmemset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __wmemset_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, wmemset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __wmemset_avx512_unaligned))
-
-#ifdef SHARED
- /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
- IFUNC_IMPL (i, name, __memcpy_chk,
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memcpy.S. */
- IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
- __memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
- __memcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy, 1,
- __memcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
-
- /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
- IFUNC_IMPL (i, name, __mempcpy_chk,
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __mempcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/mempcpy.S. */
- IFUNC_IMPL (i, name, mempcpy,
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
- __mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
- __mempcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- __mempcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- __mempcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
-
- /* Support sysdeps/x86_64/multiarch/strncmp.S. */
- IFUNC_IMPL (i, name, strncmp,
- IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
- __strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
- __strncmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */
- IFUNC_IMPL (i, name, __wmemset_chk,
- IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
- __wmemset_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __wmemset_chk_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __wmemset_chk_avx512_unaligned))
-#endif
-
- return i;
-}
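
All this file did was fill a caller-supplied array with one entry per candidate implementation, each guarded by the CPU or arch feature it needs, so the glibc test suite could exercise every variant on the machine at hand rather than only the one the IFUNC resolver would pick. A hedged sketch of a consumer, assuming the struct libc_ifunc_impl fields are (name, fn, usable) as declared in ifunc-impl-list.h and building inside the glibc tree:

#include <stdio.h>
#include <ifunc-impl-list.h>

/* List the memcmp variants known to the table above and whether each
   one is usable on the running CPU.  The array size of 32 is an
   arbitrary upper bound for this sketch.  */
static void
print_memcmp_impls (void)
{
  struct libc_ifunc_impl impls[32];
  size_t n = __libc_ifunc_impl_list ("memcmp", impls, 32);

  for (size_t i = 0; i < n; i++)
    printf ("%-24s %s\n", impls[i].name,
            impls[i].usable ? "usable" : "not usable");
}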
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
deleted file mode 100644
index d761985a47..0000000000
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Common definition for wmemset/wmemset_chk ifunc selections.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
- return OPTIMIZE (avx512_unaligned);
- else
- return OPTIMIZE (avx2_unaligned);
- }
-
- return OPTIMIZE (sse2_unaligned);
-}
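
The selector above encodes a straightforward preference order: AVX-512 when usable and not explicitly disfavoured, otherwise AVX2, otherwise the SSE2 baseline; it only returns a function pointer, and the binding to the exported symbol happens in wmemset.c / wmemset_chk.c through the IFUNC machinery. As a generic, self-contained illustration of that mechanism (not glibc's internal wiring: the resolver below uses GCC's __builtin_cpu_supports rather than glibc's cpu_features, and every name is hypothetical):

#include <stddef.h>
#include <wchar.h>

typedef wchar_t *(*wmemset_fn) (wchar_t *, wchar_t, size_t);

static wchar_t *
wmemset_generic (wchar_t *s, wchar_t c, size_t n)
{
  for (size_t i = 0; i < n; i++)
    s[i] = c;
  return s;
}

/* The resolver runs once, at relocation time, and the pointer it
   returns becomes the permanent target of my_wmemset.  A real build
   would return an AVX2 or AVX-512 variant from the first branch.  */
static wmemset_fn
my_wmemset_resolver (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return wmemset_generic;     /* stand-in for the AVX2 variant */
  return wmemset_generic;       /* baseline */
}

wchar_t *my_wmemset (wchar_t *s, wchar_t c, size_t n)
  __attribute__ ((ifunc ("my_wmemset_resolver")));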
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
deleted file mode 100644
index 47630dd97b..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ /dev/null
@@ -1,425 +0,0 @@
-/* memcmp/wmemcmp optimized with AVX2.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-/* memcmp/wmemcmp is implemented as:
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
- to avoid branches.
- 2. Use overlapping compare to avoid branch.
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
- bytes for wmemcmp.
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
- area.
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_avx2_movbe
-# endif
-
-# ifdef USE_AS_WMEMCMP
-# define VPCMPEQ vpcmpeqd
-# else
-# define VPCMPEQ vpcmpeqb
-# endif
-
-# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
-# endif
-
-# define VEC_SIZE 32
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- .section .text.avx,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
-# endif
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-
-L(last_2x_vec):
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
-L(last_vec):
- /* Use overlapping loads to avoid branches. */
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec):
- /* A byte or int32 is different within 16 or 32 bytes. */
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi, %rcx), %edx
- cmpl (%rsi, %rcx), %edx
-L(wmemcmp_return):
- setl %al
- negl %eax
- orl $1, %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
-# ifdef USE_AS_WMEMCMP
- .p2align 4
-L(4):
- xorl %eax, %eax
- movl (%rdi), %edx
- cmpl (%rsi), %edx
- jne L(wmemcmp_return)
- ret
-# else
- .p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches. */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- je L(exit)
- sbbl %eax, %eax
- orl $1, %eax
- ret
-
- .p2align 4
-L(exit):
- ret
-
- .p2align 4
-L(between_2_3):
- /* Load as big endian with overlapping loads and bswap to avoid
- branches. */
- movzwl -2(%rdi, %rdx), %eax
- movzwl -2(%rsi, %rdx), %ecx
- shll $16, %eax
- shll $16, %ecx
- movzwl (%rdi), %edi
- movzwl (%rsi), %esi
- orl %edi, %eax
- orl %esi, %ecx
- bswap %eax
- bswap %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4
-L(1):
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- subl %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
- cmpb $4, %dl
- je L(4)
- jb L(zero)
-# else
- cmpb $1, %dl
- je L(1)
- jb L(zero)
- cmpb $4, %dl
- jb L(between_2_3)
- cmpb $8, %dl
- jb L(between_4_7)
-# endif
- cmpb $16, %dl
- jae L(between_16_31)
- /* It is between 8 and 15 bytes. */
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- /* Use overlapping loads to avoid branches. */
- leaq -8(%rdi, %rdx), %rdi
- leaq -8(%rsi, %rdx), %rsi
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- ret
-
- .p2align 4
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
- vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -16(%rdi, %rdx), %rdi
- leaq -16(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- ret
-
- .p2align 4
-L(more_2x_vec):
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-
- /* From 4 * VEC to 8 * VEC, inclusively. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-
- vpand %ymm1, %ymm2, %ymm5
- vpand %ymm3, %ymm4, %ymm6
- vpand %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
- VZEROUPPER
- ret
-
- .p2align 4
-L(more_8x_vec):
- /* More than 8 * VEC. Check the first VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Align the first memory area for aligned loads in the loop.
- Compute how much the first memory area is misaligned. */
- movq %rdi, %rcx
- andl $(VEC_SIZE - 1), %ecx
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %rcx
- /* Adjust the second memory area. */
- subq %rcx, %rsi
- /* Adjust the first memory area which should be aligned now. */
- subq %rcx, %rdi
- /* Adjust length. */
- addq %rcx, %rdx
-
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rsi
-
- subq $(VEC_SIZE * 4), %rdx
- cmpq $(VEC_SIZE * 4), %rdx
- jae L(loop_4x_vec)
-
- /* Less than 4 * VEC. */
- cmpq $VEC_SIZE, %rdx
- jbe L(last_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_2x_vec)
-
-L(last_4x_vec):
- /* From 2 * VEC to 4 * VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- VZEROUPPER
- ret
-
- .p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- subl $VEC_MASK, %eax
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rcx), %edx
- cmpl VEC_SIZE(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl VEC_SIZE(%rdi, %rcx), %eax
- movzbl VEC_SIZE(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec_x2):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-END (MEMCMP)
-#endif
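
Much of the strategy described in the header comment of the file above translates directly to C; in particular, the 2-to-7-byte paths avoid length-dependent branches by doing two overlapping loads per buffer, converting them to big-endian, and packing them into a single integer so one unsigned comparison yields the lexicographic order. A hedged C sketch of the 4-to-7-byte case (memcmp_4to7 is a hypothetical name; __builtin_bswap32 stands in for the movbe/bswap instructions; n is assumed to be between 4 and 7):

#include <stdint.h>
#include <string.h>

static int
memcmp_4to7 (const unsigned char *p, const unsigned char *q, size_t n)
{
  uint32_t p_lo, p_hi, q_lo, q_hi;
  memcpy (&p_lo, p, 4);             /* first 4 bytes              */
  memcpy (&p_hi, p + n - 4, 4);     /* last 4 bytes, overlapping  */
  memcpy (&q_lo, q, 4);
  memcpy (&q_hi, q + n - 4, 4);

  /* Big-endian packing: first dword in the high half, last dword in
     the low half, mirroring the movbe/shlq/orq sequence above.  */
  uint64_t a = ((uint64_t) __builtin_bswap32 (p_lo) << 32)
               | __builtin_bswap32 (p_hi);
  uint64_t b = ((uint64_t) __builtin_bswap32 (q_lo) << 32)
               | __builtin_bswap32 (q_hi);

  return (a > b) - (a < b);         /* -1, 0 or 1, branch-free    */
}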
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 771639f662..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,1776 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-# define JMPTBL(I, B) (I - B)
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), %rcx; \
- add %r11, %rcx; \
- jmp *%rcx; \
- ud2
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
-# endif
- pxor %xmm0, %xmm0
- cmp $79, %rdx
- ja L(79bytesormore)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %rdx
- je L(firstbyte)
-# endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(firstbyte):
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rsi), %xmm1
- movdqu (%rdi), %xmm2
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-L(less128bytes):
- sub $64, %rdx
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
- cmp $32, %rdx
- jb L(less32bytesin64)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin64):
- add $64, %rdi
- add $64, %rsi
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(128bytesormore):
- cmp $512, %rdx
- ja L(512bytesormore)
- cmp $256, %rdx
- ja L(less512bytes)
-L(less256bytes):
- sub $128, %rdx
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqu 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqu 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- add $128, %rsi
- add $128, %rdi
-
- cmp $64, %rdx
- jae L(less128bytes)
-
- cmp $32, %rdx
- jb L(less32bytesin128)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin128):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(less512bytes):
- sub $256, %rdx
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqu 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqu 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- movdqu 128(%rdi), %xmm2
- pxor 128(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(144bytesin256)
-
- movdqu 144(%rdi), %xmm2
- pxor 144(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(160bytesin256)
-
- movdqu 160(%rdi), %xmm2
- pxor 160(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(176bytesin256)
-
- movdqu 176(%rdi), %xmm2
- pxor 176(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(192bytesin256)
-
- movdqu 192(%rdi), %xmm2
- pxor 192(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(208bytesin256)
-
- movdqu 208(%rdi), %xmm2
- pxor 208(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(224bytesin256)
-
- movdqu 224(%rdi), %xmm2
- pxor 224(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(240bytesin256)
-
- movdqu 240(%rdi), %xmm2
- pxor 240(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(256bytesin256)
-
- add $256, %rsi
- add $256, %rdi
-
- cmp $128, %rdx
- jae L(less256bytes)
-
- cmp $64, %rdx
- jae L(less128bytes)
-
- cmp $32, %rdx
- jb L(less32bytesin256)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin256):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(512bytesormore):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- mov %r8, %r9
- shr $1, %r8
- add %r9, %r8
- cmp %r8, %rdx
- ja L(L2_L3_cache_unaglined)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqu 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqu 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqu 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(64bytesormore_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(L2_L3_cache_unaglined):
- sub $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqu 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqu 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqu 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(L2_L3_unaligned_128bytes_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-/*
- * This case is for machines which are sensitive to unaligned instructions.
- */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- sub $64, %rdx
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
- cmp $32, %rdx
- jb L(less32bytesin64in2alinged)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin64in2alinged):
- add $64, %rdi
- add $64, %rsi
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $512, %rdx
- ja L(512bytesormorein2aligned)
- cmp $256, %rdx
- ja L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
- sub $128, %rdx
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqa 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqa 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- add $128, %rsi
- add $128, %rdi
-
- cmp $64, %rdx
- jae L(less128bytesin2aligned)
-
- cmp $32, %rdx
- jb L(less32bytesin128in2aligned)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin128in2aligned):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(256bytesormorein2aligned):
-
- sub $256, %rdx
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqa 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqa 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- movdqa 128(%rdi), %xmm2
- pxor 128(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(144bytesin256)
-
- movdqa 144(%rdi), %xmm2
- pxor 144(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(160bytesin256)
-
- movdqa 160(%rdi), %xmm2
- pxor 160(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(176bytesin256)
-
- movdqa 176(%rdi), %xmm2
- pxor 176(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(192bytesin256)
-
- movdqa 192(%rdi), %xmm2
- pxor 192(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(208bytesin256)
-
- movdqa 208(%rdi), %xmm2
- pxor 208(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(224bytesin256)
-
- movdqa 224(%rdi), %xmm2
- pxor 224(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(240bytesin256)
-
- movdqa 240(%rdi), %xmm2
- pxor 240(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(256bytesin256)
-
- add $256, %rsi
- add $256, %rdi
-
- cmp $128, %rdx
- jae L(less256bytesin2alinged)
-
- cmp $64, %rdx
- jae L(less128bytesin2aligned)
-
- cmp $32, %rdx
- jb L(less32bytesin256in2alinged)
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin256in2alinged):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(512bytesormorein2aligned):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- mov %r8, %r9
- shr $1, %r8
- add %r9, %r8
- cmp %r8, %rdx
- ja L(L2_L3_cache_aglined)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqa 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqa 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqa 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(64bytesormore_loopin2aligned)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-L(L2_L3_cache_aglined):
- sub $64, %rdx
-
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqa 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqa 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqa 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(L2_L3_aligned_128bytes_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-
- .p2align 4
-L(64bytesormore_loop_end):
- add $16, %rdi
- add $16, %rsi
- ptest %xmm2, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- ptest %xmm3, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- ptest %xmm4, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- jmp L(16bytes)
-
-L(256bytesin256):
- add $256, %rdi
- add $256, %rsi
- jmp L(16bytes)
-L(240bytesin256):
- add $240, %rdi
- add $240, %rsi
- jmp L(16bytes)
-L(224bytesin256):
- add $224, %rdi
- add $224, %rsi
- jmp L(16bytes)
-L(208bytesin256):
- add $208, %rdi
- add $208, %rsi
- jmp L(16bytes)
-L(192bytesin256):
- add $192, %rdi
- add $192, %rsi
- jmp L(16bytes)
-L(176bytesin256):
- add $176, %rdi
- add $176, %rsi
- jmp L(16bytes)
-L(160bytesin256):
- add $160, %rdi
- add $160, %rsi
- jmp L(16bytes)
-L(144bytesin256):
- add $144, %rdi
- add $144, %rsi
- jmp L(16bytes)
-L(128bytesin256):
- add $128, %rdi
- add $128, %rsi
- jmp L(16bytes)
-L(112bytesin256):
- add $112, %rdi
- add $112, %rsi
- jmp L(16bytes)
-L(96bytesin256):
- add $96, %rdi
- add $96, %rsi
- jmp L(16bytes)
-L(80bytesin256):
- add $80, %rdi
- add $80, %rsi
- jmp L(16bytes)
-L(64bytesin256):
- add $64, %rdi
- add $64, %rsi
- jmp L(16bytes)
-L(48bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(32bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(16bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(16bytes):
- mov -16(%rdi), %rax
- mov -16(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(8bytes):
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(12bytes):
- mov -12(%rdi), %rax
- mov -12(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(4bytes):
- mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
-L(0bytes):
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal case for wmemcmp */
- .p2align 4
-L(65bytes):
- movdqu -65(%rdi), %xmm1
- movdqu -65(%rsi), %xmm2
- mov $-65, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(49bytes):
- movdqu -49(%rdi), %xmm1
- movdqu -49(%rsi), %xmm2
- mov $-49, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(33bytes):
- movdqu -33(%rdi), %xmm1
- movdqu -33(%rsi), %xmm2
- mov $-33, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(17bytes):
- mov -17(%rdi), %rax
- mov -17(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(9bytes):
- mov -9(%rdi), %rax
- mov -9(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(13bytes):
- mov -13(%rdi), %rax
- mov -13(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(5bytes):
- mov -5(%rdi), %eax
- mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(66bytes):
- movdqu -66(%rdi), %xmm1
- movdqu -66(%rsi), %xmm2
- mov $-66, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(50bytes):
- movdqu -50(%rdi), %xmm1
- movdqu -50(%rsi), %xmm2
- mov $-50, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(34bytes):
- movdqu -34(%rdi), %xmm1
- movdqu -34(%rsi), %xmm2
- mov $-34, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(18bytes):
- mov -18(%rdi), %rax
- mov -18(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(10bytes):
- mov -10(%rdi), %rax
- mov -10(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(14bytes):
- mov -14(%rdi), %rax
- mov -14(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
- movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(67bytes):
- movdqu -67(%rdi), %xmm2
- movdqu -67(%rsi), %xmm1
- mov $-67, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(51bytes):
- movdqu -51(%rdi), %xmm2
- movdqu -51(%rsi), %xmm1
- mov $-51, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(35bytes):
- movdqu -35(%rsi), %xmm1
- movdqu -35(%rdi), %xmm2
- mov $-35, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(19bytes):
- mov -19(%rdi), %rax
- mov -19(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(11bytes):
- mov -11(%rdi), %rax
- mov -11(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(15bytes):
- mov -15(%rdi), %rax
- mov -15(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(7bytes):
- mov -7(%rdi), %eax
- mov -7(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
-L(1bytes):
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %ecx
- sub %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(68bytes):
- movdqu -68(%rdi), %xmm2
- movdqu -68(%rsi), %xmm1
- mov $-68, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(52bytes):
- movdqu -52(%rdi), %xmm2
- movdqu -52(%rsi), %xmm1
- mov $-52, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(36bytes):
- movdqu -36(%rdi), %xmm2
- movdqu -36(%rsi), %xmm1
- mov $-36, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(20bytes):
- movdqu -20(%rdi), %xmm2
- movdqu -20(%rsi), %xmm1
- mov $-20, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
- .p2align 4
-L(69bytes):
- movdqu -69(%rsi), %xmm1
- movdqu -69(%rdi), %xmm2
- mov $-69, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(53bytes):
- movdqu -53(%rsi), %xmm1
- movdqu -53(%rdi), %xmm2
- mov $-53, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(37bytes):
- movdqu -37(%rsi), %xmm1
- movdqu -37(%rdi), %xmm2
- mov $-37, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(21bytes):
- movdqu -21(%rsi), %xmm1
- movdqu -21(%rdi), %xmm2
- mov $-21, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(70bytes):
- movdqu -70(%rsi), %xmm1
- movdqu -70(%rdi), %xmm2
- mov $-70, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(54bytes):
- movdqu -54(%rsi), %xmm1
- movdqu -54(%rdi), %xmm2
- mov $-54, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(38bytes):
- movdqu -38(%rsi), %xmm1
- movdqu -38(%rdi), %xmm2
- mov $-38, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(22bytes):
- movdqu -22(%rsi), %xmm1
- movdqu -22(%rdi), %xmm2
- mov $-22, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(71bytes):
- movdqu -71(%rsi), %xmm1
- movdqu -71(%rdi), %xmm2
- mov $-71, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(55bytes):
- movdqu -55(%rdi), %xmm2
- movdqu -55(%rsi), %xmm1
- mov $-55, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(39bytes):
- movdqu -39(%rdi), %xmm2
- movdqu -39(%rsi), %xmm1
- mov $-39, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(23bytes):
- movdqu -23(%rdi), %xmm2
- movdqu -23(%rsi), %xmm1
- mov $-23, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(72bytes):
- movdqu -72(%rsi), %xmm1
- movdqu -72(%rdi), %xmm2
- mov $-72, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(56bytes):
- movdqu -56(%rdi), %xmm2
- movdqu -56(%rsi), %xmm1
- mov $-56, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(40bytes):
- movdqu -40(%rdi), %xmm2
- movdqu -40(%rsi), %xmm1
- mov $-40, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(24bytes):
- movdqu -24(%rdi), %xmm2
- movdqu -24(%rsi), %xmm1
- mov $-24, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -8(%rsi), %rcx
- mov -8(%rdi), %rax
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* Unreachable cases for wmemcmp (its byte length is always a multiple of 4).  */
- .p2align 4
-L(73bytes):
- movdqu -73(%rsi), %xmm1
- movdqu -73(%rdi), %xmm2
- mov $-73, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(57bytes):
- movdqu -57(%rdi), %xmm2
- movdqu -57(%rsi), %xmm1
- mov $-57, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(41bytes):
- movdqu -41(%rdi), %xmm2
- movdqu -41(%rsi), %xmm1
- mov $-41, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(25bytes):
- movdqu -25(%rdi), %xmm2
- movdqu -25(%rsi), %xmm1
- mov $-25, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -9(%rdi), %rax
- mov -9(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(74bytes):
- movdqu -74(%rsi), %xmm1
- movdqu -74(%rdi), %xmm2
- mov $-74, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(58bytes):
- movdqu -58(%rdi), %xmm2
- movdqu -58(%rsi), %xmm1
- mov $-58, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(42bytes):
- movdqu -42(%rdi), %xmm2
- movdqu -42(%rsi), %xmm1
- mov $-42, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(26bytes):
- movdqu -26(%rdi), %xmm2
- movdqu -26(%rsi), %xmm1
- mov $-26, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -10(%rdi), %rax
- mov -10(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
-
- .p2align 4
-L(75bytes):
- movdqu -75(%rsi), %xmm1
- movdqu -75(%rdi), %xmm2
- mov $-75, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(59bytes):
- movdqu -59(%rdi), %xmm2
- movdqu -59(%rsi), %xmm1
- mov $-59, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(43bytes):
- movdqu -43(%rdi), %xmm2
- movdqu -43(%rsi), %xmm1
- mov $-43, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(27bytes):
- movdqu -27(%rdi), %xmm2
- movdqu -27(%rsi), %xmm1
- mov $-27, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -11(%rdi), %rax
- mov -11(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-# endif
- .p2align 4
-L(76bytes):
- movdqu -76(%rsi), %xmm1
- movdqu -76(%rdi), %xmm2
- mov $-76, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(60bytes):
- movdqu -60(%rdi), %xmm2
- movdqu -60(%rsi), %xmm1
- mov $-60, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(44bytes):
- movdqu -44(%rdi), %xmm2
- movdqu -44(%rsi), %xmm1
- mov $-44, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(28bytes):
- movdqu -28(%rdi), %xmm2
- movdqu -28(%rsi), %xmm1
- mov $-28, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -12(%rdi), %rax
- mov -12(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* Unreachable cases for wmemcmp (its byte length is always a multiple of 4).  */
- .p2align 4
-L(77bytes):
- movdqu -77(%rsi), %xmm1
- movdqu -77(%rdi), %xmm2
- mov $-77, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(61bytes):
- movdqu -61(%rdi), %xmm2
- movdqu -61(%rsi), %xmm1
- mov $-61, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(45bytes):
- movdqu -45(%rdi), %xmm2
- movdqu -45(%rsi), %xmm1
- mov $-45, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(29bytes):
- movdqu -29(%rdi), %xmm2
- movdqu -29(%rsi), %xmm1
- mov $-29, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -13(%rdi), %rax
- mov -13(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(78bytes):
- movdqu -78(%rsi), %xmm1
- movdqu -78(%rdi), %xmm2
- mov $-78, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(62bytes):
- movdqu -62(%rdi), %xmm2
- movdqu -62(%rsi), %xmm1
- mov $-62, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(46bytes):
- movdqu -46(%rdi), %xmm2
- movdqu -46(%rsi), %xmm1
- mov $-46, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(30bytes):
- movdqu -30(%rdi), %xmm2
- movdqu -30(%rsi), %xmm1
- mov $-30, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -14(%rdi), %rax
- mov -14(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(79bytes):
- movdqu -79(%rsi), %xmm1
- movdqu -79(%rdi), %xmm2
- mov $-79, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(63bytes):
- movdqu -63(%rdi), %xmm2
- movdqu -63(%rsi), %xmm1
- mov $-63, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(47bytes):
- movdqu -47(%rdi), %xmm2
- movdqu -47(%rsi), %xmm1
- mov $-47, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(31bytes):
- movdqu -31(%rdi), %xmm2
- movdqu -31(%rsi), %xmm1
- mov $-31, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -15(%rdi), %rax
- mov -15(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-# endif
- .p2align 4
-L(64bytes):
- movdqu -64(%rdi), %xmm2
- movdqu -64(%rsi), %xmm1
- mov $-64, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(48bytes):
- movdqu -48(%rdi), %xmm2
- movdqu -48(%rsi), %xmm1
- mov $-48, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(32bytes):
- movdqu -32(%rdi), %xmm2
- movdqu -32(%rsi), %xmm1
- mov $-32, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -16(%rdi), %rax
- mov -16(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
-/*
- * Align to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
- */
- .p2align 3
-L(less16bytes):
- movsbq %dl, %rdx
- mov (%rsi, %rdx), %rcx
- mov (%rdi, %rdx), %rax
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov 8(%rsi, %rdx), %rcx
- mov 8(%rdi, %rdx), %rax
-L(diffin8bytes):
- cmp %eax, %ecx
- jne L(diffin4bytes)
- shr $32, %rcx
- shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-# endif
-
-L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
- mov $1, %eax
- jl L(nequal_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(nequal_bigger):
- ret
-
-L(unreal_case):
- xor %eax, %eax
- ret
-# endif
-
-END (MEMCMP)
-
- .section .rodata.sse4.1,"a",@progbits
- .p2align 3
-# ifndef USE_AS_WMEMCMP
-L(table_64bytes):
- .int JMPTBL (L(0bytes), L(table_64bytes))
- .int JMPTBL (L(1bytes), L(table_64bytes))
- .int JMPTBL (L(2bytes), L(table_64bytes))
- .int JMPTBL (L(3bytes), L(table_64bytes))
- .int JMPTBL (L(4bytes), L(table_64bytes))
- .int JMPTBL (L(5bytes), L(table_64bytes))
- .int JMPTBL (L(6bytes), L(table_64bytes))
- .int JMPTBL (L(7bytes), L(table_64bytes))
- .int JMPTBL (L(8bytes), L(table_64bytes))
- .int JMPTBL (L(9bytes), L(table_64bytes))
- .int JMPTBL (L(10bytes), L(table_64bytes))
- .int JMPTBL (L(11bytes), L(table_64bytes))
- .int JMPTBL (L(12bytes), L(table_64bytes))
- .int JMPTBL (L(13bytes), L(table_64bytes))
- .int JMPTBL (L(14bytes), L(table_64bytes))
- .int JMPTBL (L(15bytes), L(table_64bytes))
- .int JMPTBL (L(16bytes), L(table_64bytes))
- .int JMPTBL (L(17bytes), L(table_64bytes))
- .int JMPTBL (L(18bytes), L(table_64bytes))
- .int JMPTBL (L(19bytes), L(table_64bytes))
- .int JMPTBL (L(20bytes), L(table_64bytes))
- .int JMPTBL (L(21bytes), L(table_64bytes))
- .int JMPTBL (L(22bytes), L(table_64bytes))
- .int JMPTBL (L(23bytes), L(table_64bytes))
- .int JMPTBL (L(24bytes), L(table_64bytes))
- .int JMPTBL (L(25bytes), L(table_64bytes))
- .int JMPTBL (L(26bytes), L(table_64bytes))
- .int JMPTBL (L(27bytes), L(table_64bytes))
- .int JMPTBL (L(28bytes), L(table_64bytes))
- .int JMPTBL (L(29bytes), L(table_64bytes))
- .int JMPTBL (L(30bytes), L(table_64bytes))
- .int JMPTBL (L(31bytes), L(table_64bytes))
- .int JMPTBL (L(32bytes), L(table_64bytes))
- .int JMPTBL (L(33bytes), L(table_64bytes))
- .int JMPTBL (L(34bytes), L(table_64bytes))
- .int JMPTBL (L(35bytes), L(table_64bytes))
- .int JMPTBL (L(36bytes), L(table_64bytes))
- .int JMPTBL (L(37bytes), L(table_64bytes))
- .int JMPTBL (L(38bytes), L(table_64bytes))
- .int JMPTBL (L(39bytes), L(table_64bytes))
- .int JMPTBL (L(40bytes), L(table_64bytes))
- .int JMPTBL (L(41bytes), L(table_64bytes))
- .int JMPTBL (L(42bytes), L(table_64bytes))
- .int JMPTBL (L(43bytes), L(table_64bytes))
- .int JMPTBL (L(44bytes), L(table_64bytes))
- .int JMPTBL (L(45bytes), L(table_64bytes))
- .int JMPTBL (L(46bytes), L(table_64bytes))
- .int JMPTBL (L(47bytes), L(table_64bytes))
- .int JMPTBL (L(48bytes), L(table_64bytes))
- .int JMPTBL (L(49bytes), L(table_64bytes))
- .int JMPTBL (L(50bytes), L(table_64bytes))
- .int JMPTBL (L(51bytes), L(table_64bytes))
- .int JMPTBL (L(52bytes), L(table_64bytes))
- .int JMPTBL (L(53bytes), L(table_64bytes))
- .int JMPTBL (L(54bytes), L(table_64bytes))
- .int JMPTBL (L(55bytes), L(table_64bytes))
- .int JMPTBL (L(56bytes), L(table_64bytes))
- .int JMPTBL (L(57bytes), L(table_64bytes))
- .int JMPTBL (L(58bytes), L(table_64bytes))
- .int JMPTBL (L(59bytes), L(table_64bytes))
- .int JMPTBL (L(60bytes), L(table_64bytes))
- .int JMPTBL (L(61bytes), L(table_64bytes))
- .int JMPTBL (L(62bytes), L(table_64bytes))
- .int JMPTBL (L(63bytes), L(table_64bytes))
- .int JMPTBL (L(64bytes), L(table_64bytes))
- .int JMPTBL (L(65bytes), L(table_64bytes))
- .int JMPTBL (L(66bytes), L(table_64bytes))
- .int JMPTBL (L(67bytes), L(table_64bytes))
- .int JMPTBL (L(68bytes), L(table_64bytes))
- .int JMPTBL (L(69bytes), L(table_64bytes))
- .int JMPTBL (L(70bytes), L(table_64bytes))
- .int JMPTBL (L(71bytes), L(table_64bytes))
- .int JMPTBL (L(72bytes), L(table_64bytes))
- .int JMPTBL (L(73bytes), L(table_64bytes))
- .int JMPTBL (L(74bytes), L(table_64bytes))
- .int JMPTBL (L(75bytes), L(table_64bytes))
- .int JMPTBL (L(76bytes), L(table_64bytes))
- .int JMPTBL (L(77bytes), L(table_64bytes))
- .int JMPTBL (L(78bytes), L(table_64bytes))
- .int JMPTBL (L(79bytes), L(table_64bytes))
-# else
-L(table_64bytes):
- .int JMPTBL (L(0bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(4bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(8bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(12bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(16bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(20bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(24bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(28bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(32bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(36bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(40bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(44bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(48bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(52bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(56bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(60bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(64bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(68bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(72bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(76bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index 8d7d2fe67b..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1990 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
-   memcmp has to use UNSIGNED comparison for elements.
-*/
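
The warning above matters because memcmp ranks elements as unsigned bytes, while wmemcmp ranks them as wchar_t values, which are signed 32-bit integers on x86-64 GNU/Linux. A minimal C sketch of the difference (not glibc code; it only illustrates the ordering rule the assembly has to honour):

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  /* memcmp: 0x01 < 0xff because bytes are compared as unsigned char.  */
  unsigned char a = 0x01, b = 0xff;
  printf ("memcmp sign:  %d\n", memcmp (&a, &b, 1) < 0 ? -1 : 1);

  /* wmemcmp: an all-ones element is a negative wchar_t, so the signed
     element comparison orders the same bit patterns the other way.  */
  wchar_t x[1] = { 1 }, y[1] = { -1 };
  printf ("wmemcmp sign: %d\n", wmemcmp (x, y, 1) < 0 ? -1 : 1);
  return 0;
}

Expected output is -1 then 1: the byte comparison says "less", the wide comparison says "greater".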
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
- test %rdx, %rdx
- jz L(equal)
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx
- jae L(48bytesormore) /* LEN >= 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* RCX >= 48.  */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get here only if we already know there is a
-   difference.  */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
deleted file mode 100644
index 0c9804b7e9..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Multiple versions of memcmp
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(memcmp)
- .type memcmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 1f
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 1f
- HAS_CPU_FEATURE (MOVBE)
- jz 1f
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
- leaq __memcmp_avx2_movbe(%rip), %rax
- ret
-
-1: HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __memcmp_sse2(%rip), %rax
- ret
-
-2: HAS_CPU_FEATURE (SSE4_1)
- jz 3f
- leaq __memcmp_sse4_1(%rip), %rax
- ret
-
-3: leaq __memcmp_ssse3(%rip), %rax
- ret
-
-END(memcmp)
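
The resolver above is the hand-written form of a GNU indirect function (IFUNC): it runs once, during symbol resolution, and returns the address of the implementation every later memcmp call will use. A simplified C sketch of the same idea follows; it is not the actual glibc selector, the my_memcmp* names are invented stand-ins, and the real code additionally checks MOVBE, AVX_Fast_Unaligned_Load and Prefer_No_VZEROUPPER before choosing the AVX2 version:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static int my_memcmp_sse2 (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }   /* stand-in body */
static int my_memcmp_ssse3 (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }   /* stand-in body */
static int my_memcmp_sse4_1 (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }   /* stand-in body */
static int my_memcmp_avx2 (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }   /* stand-in body */

/* Resolver: runs before the first call through my_memcmp is bound.  */
int (*resolve_my_memcmp (void)) (const void *, const void *, size_t)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return my_memcmp_avx2;
  if (!__builtin_cpu_supports ("ssse3"))
    return my_memcmp_sse2;
  return __builtin_cpu_supports ("sse4.1")
         ? my_memcmp_sse4_1 : my_memcmp_ssse3;
}

int my_memcmp (const void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_my_memcmp")));

int
main (void)
{
  printf ("%d\n", my_memcmp ("abc", "abd", 3) < 0);
  return 0;
}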
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __memcmp_sse2, @function; \
- .p2align 4; \
- .globl __memcmp_sse2; \
- .hidden __memcmp_sse2; \
- __memcmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memcmp calls through a PLT.
- The speedup we get from using SSE4 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
-#endif
-
-#include "../memcmp.S"
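
The libc_hidden_builtin_def redefinition above exists so that memcmp calls made from inside libc bind straight to __memcmp_sse2 instead of going through the PLT and the IFUNC dispatch. A compiler-level sketch of the same trick, assuming GCC's alias and visibility attributes; the names are invented and this is not glibc's internal macro machinery:

#include <stddef.h>
#include <stdio.h>

/* Stand-in for the baseline implementation (__memcmp_sse2).  */
int
my_memcmp_base (const void *s1, const void *s2, size_t n)
{
  const unsigned char *a = s1, *b = s2;
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] - b[i];
  return 0;
}

/* Hidden alias: not exported from the shared object, so internal
   callers get a direct call with no PLT stub and no IFUNC hop.  */
extern __typeof (my_memcmp_base) my_internal_memcmp
  __attribute__ ((alias ("my_memcmp_base"), visibility ("hidden")));

int
main (void)
{
  printf ("%d\n", my_internal_memcmp ("abc", "abd", 3) < 0);
  return 0;
}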
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 4e060a27fd..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3180 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
-   relative offsets. INDEX is a register that contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- jmp *INDEX; \
- ud2
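
The macro above implements a position-independent jump table: each 32-bit entry stores the signed distance from the table to its target, and the dispatcher adds that distance back to the table's address before the indirect jump, so the table needs no dynamic relocations. A rough C analogue, not glibc code, using the GCC labels-as-values extension (the function and label names are made up; the assembly uses the table itself as the reference point, while this sketch uses the first label):

#include <stdio.h>

static const char *
size_class (unsigned int n)
{
  /* Like JMPTBL (I, B): each entry is "target minus reference point".  */
  static const int table[] = {
    &&tiny - &&tiny, &&small - &&tiny, &&large - &&tiny
  };
  if (n > 2)
    return "out of range";
  /* BRANCH_TO_JMPTBL_ENTRY: load the entry, add the base, jump.  */
  goto *(&&tiny + table[n]);
tiny:
  return "tiny";
small:
  return "small";
large:
  return "large";
}

int
main (void)
{
  for (unsigned int i = 0; i < 3; i++)
    printf ("%u -> %s\n", i, size_class (i));
  return 0;
}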
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %rdi, %rax
-#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index f3ea52a46c..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3150 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
-   relative offsets.  INDEX is a register containing the index into
-   the jump table.  SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- jmp *INDEX; \
- ud2
-
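The BRANCH_TO_JMPTBL_ENTRY macro above is the dispatch half of the relative
jump tables used throughout this file (L(table_less_80bytes), L(shl_table),
L(shl_table_bwd)): each .int JMPTBL (label, table) entry stores its target as
a 32-bit offset from the table itself, and the macro rebuilds the absolute
address with lea/movslq before the indirect jump.  Roughly the same idea in
C, as a sketch only, using the GNU "labels as values" extension rather than
glibc's macros; the function and label names below are illustrative, not
taken from glibc:

    #include <stddef.h>

    /* Copy the last n bytes (n <= 2 in this toy example) by jumping
       through a table of label offsets stored relative to a base label:
       base + entry, then fall through from the largest case down to the
       smallest, the same shape as the bwd_write_N / fwd_write_N cases.  */
    void copy_tail (char *dst, const char *src, size_t n)
    {
      static const int table[] = {
        &&bytes_0 - &&bytes_0,   /* like .int JMPTBL (L(..._0bytes), table) */
        &&bytes_1 - &&bytes_0,
        &&bytes_2 - &&bytes_0,
      };
      goto *(&&bytes_0 + table[n]);  /* the lea/movslq/jmp * of the macro */
    bytes_2:
      dst[1] = src[1];
    bytes_1:
      dst[0] = src[0];
    bytes_0:
      return;
    }

Because the entries are offsets rather than absolute addresses, the tables
can live in read-only data and need no dynamic relocations in a shared
library, which is why the assembly stores them as .int differences.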
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %rdi, %rax
-#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
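The L(shl_1)/L(shl_1_bwd) pair above, and the shl_2 through shl_15 blocks
that follow, handle a source that is N bytes away from 16-byte alignment:
the loop performs only aligned 16-byte loads and uses palignr $N to stitch
each pair of neighbouring chunks back into the unaligned byte stream.  A
sketch of the same inner loop with C intrinsics (build with -mssse3; fixed
SHIFT, size a multiple of 64, alignment as stated in the comments; none of
these names come from glibc):

    #include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 maps to palignr */
    #include <stddef.h>

    #define SHIFT 1          /* the N in L(shl_N); palignr needs an immediate */

    /* dst is 16-byte aligned, src_base is the 16-byte boundary SHIFT bytes
       before the source data, and n is a multiple of 64.  */
    void copy_shifted (char *dst, const char *src_base, size_t n)
    {
      __m128i prev = _mm_load_si128 ((const __m128i *) src_base);
      while (n != 0)
        {
          __m128i c1 = _mm_load_si128 ((const __m128i *) (src_base + 16));
          __m128i c2 = _mm_load_si128 ((const __m128i *) (src_base + 32));
          __m128i c3 = _mm_load_si128 ((const __m128i *) (src_base + 48));
          __m128i c4 = _mm_load_si128 ((const __m128i *) (src_base + 64));
          /* Each palignr recovers 16 unaligned source bytes from two
             neighbouring aligned chunks, like the four palignr $SHIFT in
             the loop above.  */
          _mm_store_si128 ((__m128i *) dst,        _mm_alignr_epi8 (c1, prev, SHIFT));
          _mm_store_si128 ((__m128i *) (dst + 16), _mm_alignr_epi8 (c2, c1, SHIFT));
          _mm_store_si128 ((__m128i *) (dst + 32), _mm_alignr_epi8 (c3, c2, SHIFT));
          _mm_store_si128 ((__m128i *) (dst + 48), _mm_alignr_epi8 (c4, c3, SHIFT));
          prev = c4;
          src_base += 64;
          dst += 64;
          n -= 64;
        }
    }

The assembly keeps one chunk live across iterations (xmm1, here prev) so
that only four new loads are needed per 64 bytes copied; the leftover tail
is finished through the byte-count jump tables.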
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
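
Aside: the L(large_page_fwd)/L(large_page_bwd) paths in the deleted file above stream the bulk of a very large copy with movntdq and finish with a single sfence, so the copied data bypasses the cache instead of evicting it. A minimal C sketch of that streaming-store idea using SSE2 intrinsics follows; the function name stream_copy, the 64-byte unroll (the assembly unrolls by 128 bytes), and the plain memcpy tail are illustrative assumptions, not code taken from this file.

    /* Sketch: bulk copy with non-temporal (streaming) stores, in the
       spirit of the L(large_page_*) paths above.  Assumes dst is
       16-byte aligned; the real code first aligns the destination and
       dispatches the 0..79-byte tail through a jump table.  */
    #include <emmintrin.h>
    #include <stddef.h>
    #include <string.h>

    static void stream_copy (void *dst, const void *src, size_t n)
    {
      char *d = dst;
      const char *s = src;

      /* Stream 64 bytes per iteration; the loads may be unaligned.  */
      while (n >= 64)
        {
          __m128i x0 = _mm_loadu_si128 ((const __m128i *) (s + 0));
          __m128i x1 = _mm_loadu_si128 ((const __m128i *) (s + 16));
          __m128i x2 = _mm_loadu_si128 ((const __m128i *) (s + 32));
          __m128i x3 = _mm_loadu_si128 ((const __m128i *) (s + 48));
          _mm_stream_si128 ((__m128i *) (d + 0), x0);   /* movntdq */
          _mm_stream_si128 ((__m128i *) (d + 16), x1);
          _mm_stream_si128 ((__m128i *) (d + 32), x2);
          _mm_stream_si128 ((__m128i *) (d + 48), x3);
          s += 64; d += 64; n -= 64;
        }
      _mm_sfence ();          /* order the weakly-ordered streaming stores */
      memcpy (d, s, n);       /* remaining 0..63 bytes */
    }

The sfence matters because non-temporal stores are weakly ordered; issuing it once after the loop, as the assembly does before its tail dispatch, is sufficient.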
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
deleted file mode 100644
index af2770397c..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Multiple versions of memcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. In static binaries we need memcpy before the initialization
- happened. */
-#if defined SHARED && IS_IN (libc)
- .text
-ENTRY(__new_memcpy)
- .type __new_memcpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memcpy_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memcpy_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memcpy_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memcpy_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memcpy_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memcpy_ssse3(%rip), %RAX_LP
-2: ret
-END(__new_memcpy)
-
-# undef memcpy
-# include <shlib-compat.h>
-versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
-#endif
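
Aside: __new_memcpy above is a hand-written STT_GNU_IFUNC resolver; the dynamic linker calls it once at relocation time and records the returned implementation as the real memcpy. A hedged C sketch of the same dispatch mechanism follows, using GCC's ifunc attribute. The names do_copy, do_copy_resolver, copy_ssse3 and copy_generic are made up, the variant bodies are stand-ins, and __builtin_cpu_supports stands in for glibc's internal HAS_CPU_FEATURE/HAS_ARCH_FEATURE checks; this is not the selector glibc itself uses.

    /* Sketch of IFUNC-style dispatch written in C (GNU/Linux, GCC).  */
    #include <stddef.h>
    #include <stdio.h>

    static void *
    copy_generic (void *dst, const void *src, size_t n)
    {
      return __builtin_memcpy (dst, src, n);  /* stand-in baseline variant */
    }

    static void *
    copy_ssse3 (void *dst, const void *src, size_t n)
    {
      return __builtin_memcpy (dst, src, n);  /* stand-in "tuned" variant */
    }

    typedef void *(*copy_fn) (void *, const void *, size_t);

    /* The resolver runs once, when the symbol is first resolved; callers
       of do_copy then invoke whichever implementation it returned.  */
    static copy_fn
    do_copy_resolver (void)
    {
      __builtin_cpu_init ();   /* resolvers may run before constructors */
      if (__builtin_cpu_supports ("ssse3"))
        return copy_ssse3;
      return copy_generic;
    }

    void *do_copy (void *, const void *, size_t)
      __attribute__ ((ifunc ("do_copy_resolver")));

    int
    main (void)
    {
      char a[8] = "abcdefg", b[8];
      do_copy (b, a, sizeof a);
      puts (b);
      return 0;
    }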
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
deleted file mode 100644
index 8737fb9755..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Multiple versions of __memcpy_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch memcpy functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__memcpy_chk)
- .type __memcpy_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memcpy_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__memcpy_chk)
-# else
-# include "../memcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
deleted file mode 100644
index e195e93f15..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ /dev/null
@@ -1,12 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-
-# define SECTION(p) p##.avx
-# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
deleted file mode 100644
index f3ef10577c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,420 +0,0 @@
-/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-# include "asm-syntax.h"
-
- .section .text.avx512,"ax",@progbits
-# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_avx512_no_vzeroupper)
-
-ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (__mempcpy_avx512_no_vzeroupper)
-# endif
-
-# ifdef SHARED
-ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_avx512_no_vzeroupper)
-# endif
-
-ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
-# ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-# endif
-L(start):
- lea (%rsi, %rdx), %rcx
- lea (%rdi, %rdx), %r9
- cmp $512, %rdx
- ja L(512bytesormore)
-
-L(check):
- cmp $16, %rdx
- jbe L(less_16bytes)
- cmp $256, %rdx
- jb L(less_256bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups -0x100(%rcx), %zmm4
- vmovups -0xC0(%rcx), %zmm5
- vmovups -0x80(%rcx), %zmm6
- vmovups -0x40(%rcx), %zmm7
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, -0x100(%r9)
- vmovups %zmm5, -0xC0(%r9)
- vmovups %zmm6, -0x80(%r9)
- vmovups %zmm7, -0x40(%r9)
- ret
-
-L(less_256bytes):
- cmp $128, %dl
- jb L(less_128bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups -0x80(%rcx), %zmm2
- vmovups -0x40(%rcx), %zmm3
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, -0x80(%r9)
- vmovups %zmm3, -0x40(%r9)
- ret
-
-L(less_128bytes):
- cmp $64, %dl
- jb L(less_64bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu 0x20(%rsi), %ymm1
- vmovdqu -0x40(%rcx), %ymm2
- vmovdqu -0x20(%rcx), %ymm3
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 0x20(%rdi)
- vmovdqu %ymm2, -0x40(%r9)
- vmovdqu %ymm3, -0x20(%r9)
- ret
-
-L(less_64bytes):
- cmp $32, %dl
- jb L(less_32bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu -0x20(%rcx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -0x20(%r9)
- ret
-
-L(less_32bytes):
- vmovdqu (%rsi), %xmm0
- vmovdqu -0x10(%rcx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -0x10(%r9)
- ret
-
-L(less_16bytes):
- cmp $8, %dl
- jb L(less_8bytes)
- movq (%rsi), %rsi
- movq -0x8(%rcx), %rcx
- movq %rsi, (%rdi)
- movq %rcx, -0x8(%r9)
- ret
-
-L(less_8bytes):
- cmp $4, %dl
- jb L(less_4bytes)
- mov (%rsi), %esi
- mov -0x4(%rcx), %ecx
- mov %esi, (%rdi)
- mov %ecx, -0x4(%r9)
- ret
-
-L(less_4bytes):
- cmp $2, %dl
- jb L(less_2bytes)
- mov (%rsi), %si
- mov -0x2(%rcx), %cx
- mov %si, (%rdi)
- mov %cx, -0x2(%r9)
- ret
-
-L(less_2bytes):
- cmp $1, %dl
- jb L(less_1bytes)
- mov (%rsi), %cl
- mov %cl, (%rdi)
-L(less_1bytes):
- ret
-
-L(512bytesormore):
-# ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %r8
-# else
- mov __x86_shared_cache_size_half(%rip), %r8
-# endif
- cmp %r8, %rdx
- jae L(preloop_large)
- cmp $1024, %rdx
- ja L(1024bytesormore)
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- prefetcht1 -0x200(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0x40(%rcx)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- vmovups %zmm8, -0x200(%r9)
- vmovups %zmm9, -0x1C0(%r9)
- vmovups %zmm10, -0x180(%r9)
- vmovups %zmm11, -0x140(%r9)
- vmovups %zmm12, -0x100(%r9)
- vmovups %zmm13, -0xC0(%r9)
- vmovups %zmm14, -0x80(%r9)
- vmovups %zmm15, -0x40(%r9)
- ret
-
-L(1024bytesormore):
- cmp %rsi, %rdi
- ja L(1024bytesormore_bkw)
- sub $512, %r9
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
-
-/* Loop with unaligned memory access. */
-L(gobble_512bytes_loop):
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- add $512, %rsi
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- add $512, %rdi
- cmp %r9, %rdi
- jb L(gobble_512bytes_loop)
- vmovups %zmm8, (%r9)
- vmovups %zmm9, 0x40(%r9)
- vmovups %zmm10, 0x80(%r9)
- vmovups %zmm11, 0xC0(%r9)
- vmovups %zmm12, 0x100(%r9)
- vmovups %zmm13, 0x140(%r9)
- vmovups %zmm14, 0x180(%r9)
- vmovups %zmm15, 0x1C0(%r9)
- ret
-
-L(1024bytesormore_bkw):
- add $512, %rdi
- vmovups 0x1C0(%rsi), %zmm8
- vmovups 0x180(%rsi), %zmm9
- vmovups 0x140(%rsi), %zmm10
- vmovups 0x100(%rsi), %zmm11
- vmovups 0xC0(%rsi), %zmm12
- vmovups 0x80(%rsi), %zmm13
- vmovups 0x40(%rsi), %zmm14
- vmovups (%rsi), %zmm15
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
-
-/* Backward loop with unaligned memory access. */
-L(gobble_512bytes_loop_bkw):
- vmovups -0x40(%rcx), %zmm0
- vmovups -0x80(%rcx), %zmm1
- vmovups -0xC0(%rcx), %zmm2
- vmovups -0x100(%rcx), %zmm3
- vmovups -0x140(%rcx), %zmm4
- vmovups -0x180(%rcx), %zmm5
- vmovups -0x1C0(%rcx), %zmm6
- vmovups -0x200(%rcx), %zmm7
- sub $512, %rcx
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
- vmovups %zmm0, -0x40(%r9)
- vmovups %zmm1, -0x80(%r9)
- vmovups %zmm2, -0xC0(%r9)
- vmovups %zmm3, -0x100(%r9)
- vmovups %zmm4, -0x140(%r9)
- vmovups %zmm5, -0x180(%r9)
- vmovups %zmm6, -0x1C0(%r9)
- vmovups %zmm7, -0x200(%r9)
- sub $512, %r9
- cmp %rdi, %r9
- ja L(gobble_512bytes_loop_bkw)
- vmovups %zmm8, -0x40(%rdi)
- vmovups %zmm9, -0x80(%rdi)
- vmovups %zmm10, -0xC0(%rdi)
- vmovups %zmm11, -0x100(%rdi)
- vmovups %zmm12, -0x140(%rdi)
- vmovups %zmm13, -0x180(%rdi)
- vmovups %zmm14, -0x1C0(%rdi)
- vmovups %zmm15, -0x200(%rdi)
- ret
-
-L(preloop_large):
- cmp %rsi, %rdi
- ja L(preloop_large_bkw)
- vmovups (%rsi), %zmm4
- vmovups 0x40(%rsi), %zmm5
-
-/* Align destination for access with non-temporal stores in the loop. */
- mov %rdi, %r8
- and $-0x80, %rdi
- add $0x80, %rdi
- sub %rdi, %r8
- sub %r8, %rsi
- add %r8, %rdx
-L(gobble_256bytes_nt_loop):
- prefetcht1 0x200(%rsi)
- prefetcht1 0x240(%rsi)
- prefetcht1 0x280(%rsi)
- prefetcht1 0x2C0(%rsi)
- prefetcht1 0x300(%rsi)
- prefetcht1 0x340(%rsi)
- prefetcht1 0x380(%rsi)
- prefetcht1 0x3C0(%rsi)
- vmovdqu64 (%rsi), %zmm0
- vmovdqu64 0x40(%rsi), %zmm1
- vmovdqu64 0x80(%rsi), %zmm2
- vmovdqu64 0xC0(%rsi), %zmm3
- vmovntdq %zmm0, (%rdi)
- vmovntdq %zmm1, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm3, 0xC0(%rdi)
- sub $256, %rdx
- add $256, %rsi
- add $256, %rdi
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop)
- sfence
- vmovups %zmm4, (%rax)
- vmovups %zmm5, 0x40(%rax)
- jmp L(check)
-
-L(preloop_large_bkw):
- vmovups -0x80(%rcx), %zmm4
- vmovups -0x40(%rcx), %zmm5
-
-/* Align end of destination for access with non-temporal stores. */
- mov %r9, %r8
- and $-0x80, %r9
- sub %r9, %r8
- sub %r8, %rcx
- sub %r8, %rdx
- add %r9, %r8
-L(gobble_256bytes_nt_loop_bkw):
- prefetcht1 -0x400(%rcx)
- prefetcht1 -0x3C0(%rcx)
- prefetcht1 -0x380(%rcx)
- prefetcht1 -0x340(%rcx)
- prefetcht1 -0x300(%rcx)
- prefetcht1 -0x2C0(%rcx)
- prefetcht1 -0x280(%rcx)
- prefetcht1 -0x240(%rcx)
- vmovdqu64 -0x100(%rcx), %zmm0
- vmovdqu64 -0xC0(%rcx), %zmm1
- vmovdqu64 -0x80(%rcx), %zmm2
- vmovdqu64 -0x40(%rcx), %zmm3
- vmovntdq %zmm0, -0x100(%r9)
- vmovntdq %zmm1, -0xC0(%r9)
- vmovntdq %zmm2, -0x80(%r9)
- vmovntdq %zmm3, -0x40(%r9)
- sub $256, %rdx
- sub $256, %rcx
- sub $256, %r9
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop_bkw)
- sfence
- vmovups %zmm4, -0x80(%r8)
- vmovups %zmm5, -0x40(%r8)
- jmp L(check)
-END (__memmove_avx512_no_vzeroupper)
-
-# ifdef SHARED
-strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
-strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
-# endif
-#endif
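
Aside: the short-size ladder above (L(less_16bytes) down through L(less_2bytes)) loads one chunk from each end of the source before storing anything, so overlapping source and destination are handled without any extra branches. A small C sketch of that two-ends pattern for sizes up to 16 bytes follows; the name move_up_to_16 and the use of memcpy for unaligned loads are illustrative, and the deleted code extends the same idea with xmm/ymm/zmm pairs for larger sizes.

    /* Overlap-safe copy of n <= 16 bytes: read a chunk from each end of
       src into registers before writing anything, as the ladder above
       does with 8/4/2/1-byte chunks.  */
    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void
    move_up_to_16 (void *dst, const void *src, size_t n)
    {
      char *d = dst;
      const char *s = src;

      if (n >= 8)
        {
          uint64_t head, tail;
          memcpy (&head, s, 8);
          memcpy (&tail, s + n - 8, 8);   /* overlaps head when n < 16 */
          memcpy (d, &head, 8);
          memcpy (d + n - 8, &tail, 8);
        }
      else if (n >= 4)
        {
          uint32_t head, tail;
          memcpy (&head, s, 4);
          memcpy (&tail, s + n - 4, 4);
          memcpy (d, &head, 4);
          memcpy (d + n - 4, &tail, 4);
        }
      else if (n >= 2)
        {
          uint16_t head, tail;
          memcpy (&head, s, 2);
          memcpy (&tail, s + n - 2, 2);
          memcpy (d, &head, 2);
          memcpy (d + n - 2, &tail, 2);
        }
      else if (n == 1)
        *d = *s;
    }

Because both chunks are read before either is written, the function is a correct memmove for these sizes even when the two buffers overlap in either direction.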
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
deleted file mode 100644
index aac1515cf6..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ /dev/null
@@ -1,12 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define SECTION(p) p##.avx512
-# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
deleted file mode 100644
index dee3ec529c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ /dev/null
@@ -1,553 +0,0 @@
-/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* memmove/memcpy/mempcpy is implemented as:
- 1. Use overlapping load and store to avoid branch.
- 2. Load all sources into registers and store them together to avoid
- possible address overlap between source and destination.
- 3. If size is 8 * VEC_SIZE or less, load all sources into registers
- and store them together.
- 4. If address of destination > address of source, backward copy
- 4 * VEC_SIZE at a time with unaligned load and aligned store.
- Load the first 4 * VEC and last VEC before the loop and store
- them after the loop to support overlapping addresses.
- 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
- load and aligned store. Load the last 4 * VEC and first VEC
- before the loop and store them after the loop to support
- overlapping addresses.
- 6. If size >= __x86_shared_non_temporal_threshold and there is no
- overlap between destination and source, use non-temporal store
- instead of aligned store. */
-
-#include <sysdep.h>
-
-#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
-#endif
-
-/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
- up REP MOVSB operation, REP MOVSB isn't faster on short data. The
- memcpy micro benchmark in glibc shows that 2KB is the approximate
- value above which REP MOVSB becomes faster than SSE2 optimization
- on processors with Enhanced REP MOVSB. Since larger register size
- can move more data with a single load and store, the threshold is
- higher with larger register size. */
-#ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
-#endif
-
-#ifndef PREFETCH
-# define PREFETCH(addr) prefetcht0 addr
-#endif
-
-/* Assume 64-byte prefetch size. */
-#ifndef PREFETCH_SIZE
-# define PREFETCH_SIZE 64
-#endif
-
-#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
-
-#if PREFETCH_SIZE == 64
-# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
-# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base)
-# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
-# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base)
-# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
-# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
-# else
-# error Unsupported PREFETCHED_LOAD_SIZE!
-# endif
-#else
-# error Unsupported PREFETCH_SIZE!
-#endif
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
- .section SECTION(.text),"ax",@progbits
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-#endif
-
-#if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-#endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
- movq %rdi, %rax
-L(start):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
- VZEROUPPER
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-#endif
- ret
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
-# if VEC_SIZE == 16
-# if defined SHARED
-/* Only used to measure performance of REP MOVSB. */
-ENTRY (__mempcpy_erms)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start_movsb)
-END (__mempcpy_erms)
-# endif
-
-ENTRY (__memmove_erms)
- movq %rdi, %rax
-L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
- jb L(movsb_backward)
-1:
- rep movsb
-2:
- ret
-L(movsb_backward):
- leaq -1(%rdi,%rcx), %rdi
- leaq -1(%rsi,%rcx), %rsi
- std
- rep movsb
- cld
- ret
-END (__memmove_erms)
-# if defined SHARED
-strong_alias (__memmove_erms, __memcpy_erms)
-# endif
-# endif
-
-# ifdef SHARED
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start_erms)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
- movq %rdi, %rax
-L(start_erms):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(movsb_more_2x_vec)
-L(last_2x_vec):
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-L(return):
- VZEROUPPER
- ret
-
-L(movsb):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(more_8x_vec)
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %r9
- cmpq %r9, %rdi
- /* Avoid slow backward REP MOVSB. */
-# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
-# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
-# endif
- jb L(more_8x_vec_backward)
-1:
- movq %rdx, %rcx
- rep movsb
-L(nop):
- ret
-#endif
-
-L(less_vec):
- /* Less than 1 VEC. */
-#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-#endif
-#if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
-#endif
-#if VEC_SIZE > 16
- cmpb $16, %dl
- jae L(between_16_31)
-#endif
- cmpb $8, %dl
- jae L(between_8_15)
- cmpb $4, %dl
- jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movzbl (%rsi), %ecx
- movb %cl, (%rdi)
-1:
- ret
-#if VEC_SIZE > 32
-L(between_32_63):
- /* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
- VZEROUPPER
- ret
-#endif
-#if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
- ret
-#endif
-L(between_8_15):
- /* From 8 to 15. No branch when size == 8. */
- movq -8(%rsi,%rdx), %rcx
- movq (%rsi), %rsi
- movq %rcx, -8(%rdi,%rdx)
- movq %rsi, (%rdi)
- ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl -4(%rsi,%rdx), %ecx
- movl (%rsi), %esi
- movl %ecx, -4(%rdi,%rdx)
- movl %esi, (%rdi)
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movzwl -2(%rsi,%rdx), %ecx
- movzwl (%rsi), %esi
- movw %cx, -2(%rdi,%rdx)
- movw %si, (%rdi)
- ret
-
-#if defined USE_MULTIARCH && IS_IN (libc)
-L(movsb_more_2x_vec):
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- ja L(movsb)
-#endif
-L(more_2x_vec):
- /* More than 2 * VEC and there may be overlap between destination
- and source. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
- /* Copy from 4 * VEC to 8 * VEC, inclusively. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER
- ret
-L(last_4x_vec):
- /* Copy from 2 * VEC to 4 * VEC. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- VZEROUPPER
- ret
-
-L(more_8x_vec):
- cmpq %rsi, %rdi
- ja L(more_8x_vec_backward)
- /* Source == destination is less common. */
- je L(nop)
- /* Load the first VEC and last 4 * VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
- VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
- /* Save start and stop of the destination buffer. */
- movq %rdi, %r11
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
- /* Align destination for aligned stores in the loop. Compute
- how much destination is misaligned. */
- movq %rdi, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %r8
- /* Adjust source. */
- subq %r8, %rsi
- /* Adjust destination which should be aligned now. */
- subq %r8, %rdi
- /* Adjust length. */
- addq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- ja L(large_forward)
-#endif
-L(loop_4x_vec_forward):
- /* Copy 4 * VEC a time forward. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $(VEC_SIZE * 4), %rsi
- subq $(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(1), VEC_SIZE(%rdi)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $(VEC_SIZE * 4), %rdi
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_forward)
- /* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
-
-L(more_8x_vec_backward):
- /* Load the first 4 * VEC and last VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
- VMOVU VEC_SIZE(%rsi), %VEC(5)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
- /* Save stop of the destination buffer. */
- leaq -VEC_SIZE(%rdi, %rdx), %r11
- /* Align destination end for aligned stores in the loop. Compute
- how much destination end is misaligned. */
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- movq %r11, %r9
- movq %r11, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Adjust source. */
- subq %r8, %rcx
- /* Adjust the end of destination which should be aligned now. */
- subq %r8, %r9
- /* Adjust length. */
- subq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- ja L(large_backward)
-#endif
-L(loop_4x_vec_backward):
- /* Copy 4 * VEC a time backward. */
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $(VEC_SIZE * 4), %rcx
- subq $(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%r9)
- VMOVA %VEC(1), -VEC_SIZE(%r9)
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $(VEC_SIZE * 4), %r9
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_backward)
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
-
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rdi, %rdx), %r10
- cmpq %r10, %rsi
- jb L(loop_4x_vec_forward)
-L(loop_large_forward):
- /* Copy 4 * VEC a time forward with non-temporal stores. */
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $PREFETCHED_LOAD_SIZE, %rsi
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%rdi)
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $PREFETCHED_LOAD_SIZE, %rdi
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_forward)
- sfence
- /* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
-
-L(large_backward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rcx, %rdx), %r10
- cmpq %r10, %r9
- jb L(loop_4x_vec_backward)
-L(loop_large_backward):
- /* Copy 4 * VEC a time backward with non-temporal stores. */
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $PREFETCHED_LOAD_SIZE, %rcx
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%r9)
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $PREFETCHED_LOAD_SIZE, %r9
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_backward)
- sfence
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
-#endif
-END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-
-#ifdef SHARED
-# if IS_IN (libc)
-# ifdef USE_MULTIARCH
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
- MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
- MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-# endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
- MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
-# endif
-#endif
-#if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
- MEMCPY_SYMBOL (__memcpy, unaligned))
-#endif
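
The header comment of memmove-vec-unaligned-erms.S above describes the core trick: read the head and the tail of the buffer before writing anything, so one pair of overlapping stores handles every length between VEC_SIZE and 2 * VEC_SIZE, and pick the copy direction from the pointer order when the buffers may overlap. A minimal C sketch of those two ideas follows; VEC is fixed at 16 here, memcpy stands in for the unaligned vector loads and stores, and toy_memmove is a made-up name, not glibc code.

#include <stddef.h>
#include <string.h>

#define VEC 16   /* stand-in for VEC_SIZE; the real code picks 16/32/64 */

static void *
toy_memmove (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (n >= VEC && n <= 2 * VEC)
    {
      /* Both loads happen before either store, so overlap is harmless,
         and the two stores cover every size in [VEC, 2 * VEC] with no
         branch on the exact length.  */
      unsigned char head[VEC], tail[VEC];
      memcpy (head, s, VEC);
      memcpy (tail, s + n - VEC, VEC);
      memcpy (d, head, VEC);
      memcpy (d + n - VEC, tail, VEC);
      return dst;
    }

  /* Otherwise choose a direction that never clobbers unread source bytes:
     copy backward when the destination starts inside the source range.  */
  if (d > s && d < s + n)
    for (size_t i = n; i-- > 0; )
      d[i] = s[i];
  else
    for (size_t i = 0; i < n; i++)
      d[i] = s[i];
  return dst;
}
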
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
deleted file mode 100644
index 8c534e83e0..0000000000
--- a/sysdeps/x86_64/multiarch/memmove.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Multiple versions of memmove
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. */
-#if IS_IN (libc)
- .text
-ENTRY(__libc_memmove)
- .type __libc_memmove, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memmove_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memmove_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memmove_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memmove_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memmove_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memmove_ssse3(%rip), %RAX_LP
-2: ret
-END(__libc_memmove)
-#endif
-
-#if IS_IN (libc)
-# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
-
-# ifdef SHARED
-libc_hidden_ver (__memmove_sse2_unaligned, memmove)
-libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
-libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
-libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memmove calls through a PLT.
- The speedup we get from using SSE2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def
-# endif
-strong_alias (__libc_memmove, memmove)
-#endif
-
-#if !defined SHARED || !IS_IN (libc)
-weak_alias (__mempcpy, mempcpy)
-#endif
-
-#include "../memmove.S"
-
-#if defined SHARED && IS_IN (libc)
-# include <shlib-compat.h>
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-/* Use __memmove_sse2_unaligned to support overlapping addresses. */
-compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
-# endif
-#endif
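
The deleted memmove.S above is an IFUNC selector: __libc_memmove is typed @gnu_indirect_function, so its body runs once at relocation time and returns the address of the variant matching the CPU. A rough C analogue using GCC's ifunc attribute on ELF targets is shown below; my_memmove, the feature test, and both variants are hypothetical stand-ins, and only the dispatch mechanism mirrors the assembly.

#include <stddef.h>
#include <string.h>

typedef void *(*memmove_fn) (void *, const void *, size_t);

static void *
my_memmove_generic (void *d, const void *s, size_t n)
{
  return memmove (d, s, n);          /* stand-in for a baseline variant */
}

static void *
my_memmove_avx2 (void *d, const void *s, size_t n)
{
  return memmove (d, s, n);          /* stand-in for an AVX/ERMS variant */
}

/* The resolver runs once, during relocation; its result is patched into the
   GOT so every later call jumps straight to the selected variant, mirroring
   the HAS_ARCH_FEATURE/HAS_CPU_FEATURE chain in the assembly above.  */
static memmove_fn
resolve_my_memmove (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2")
         ? my_memmove_avx2 : my_memmove_generic;
}

void *my_memmove (void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_my_memmove")));
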
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S
deleted file mode 100644
index 7870dd0247..0000000000
--- a/sysdeps/x86_64/multiarch/memmove_chk.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Multiple versions of __memmove_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch memmove functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__memmove_chk)
- .type __memmove_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memmove_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memmove_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__memmove_chk)
-# else
-# include "../memmove_chk.S"
-# endif
-#endif
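
Each __memmove_chk variant dispatched above starts with the same two instructions: compare the compiler-supplied destination object size in %rcx with the length in %rdx and branch to __chk_fail on overflow, then fall through into the corresponding memmove body. A rough C rendering of that check (toy_memmove_chk is a hypothetical name; abort stands in for __chk_fail):

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

static void *
toy_memmove_chk (void *dst, const void *src, size_t len, size_t dstlen)
{
  if (dstlen < len)
    abort ();                 /* __chk_fail: fortification failure */
  return memmove (dst, src, len);
}
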
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
deleted file mode 100644
index b8b2b28094..0000000000
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Multiple versions of mempcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. In static binaries we need mempcpy before the initialization
- happened. */
-#if defined SHARED && IS_IN (libc)
- .text
-ENTRY(__mempcpy)
- .type __mempcpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __mempcpy_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __mempcpy_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __mempcpy_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __mempcpy_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __mempcpy_ssse3(%rip), %RAX_LP
-2: ret
-END(__mempcpy)
-
-weak_alias (__mempcpy, mempcpy)
-#endif
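
mempcpy differs from memcpy only in its return value: the prologue of the __mempcpy entry points above (movq %rdi, %rax; addq %rdx, %rax) precomputes dst + n before jumping into the shared copy body. A C sketch of the semantics, with a chaining example showing why that return value is useful (toy_mempcpy is a made-up name):

#include <stddef.h>
#include <string.h>

static void *
toy_mempcpy (void *dst, const void *src, size_t n)
{
  memcpy (dst, src, n);
  return (char *) dst + n;    /* points just past the copied bytes */
}

/* Chaining copies without recomputing offsets:
     char buf[32], *p = buf;
     p = toy_mempcpy (p, "foo", 3);
     p = toy_mempcpy (p, "bar", 4);   -- copies the NUL too  */
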
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
deleted file mode 100644
index 072b22c49f..0000000000
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Multiple versions of __mempcpy_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch mempcpy functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__mempcpy_chk)
- .type __mempcpy_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __mempcpy_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__mempcpy_chk)
-# else
-# include "../mempcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
deleted file mode 100644
index 7ab3d89849..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ /dev/null
@@ -1,22 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
-
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
-
-# define SECTION(p) p##.avx
-# define MEMSET_SYMBOL(p,s) p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
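
The two VDUP macros above set up the AVX2 build: vmovd moves the fill value into xmm0 and vpbroadcastb/vpbroadcastd splat it across ymm0, while the destination pointer is copied to %rax as the return value. The broadcast half corresponds to these AVX2 intrinsics (illustration only, compile with -mavx2; the function names are made up):

#include <immintrin.h>

static __m256i
broadcast_fill_byte (int c)       /* memset case: vpbroadcastb */
{
  return _mm256_set1_epi8 ((char) c);
}

static __m256i
broadcast_fill_wchar (int w)      /* wmemset case: vpbroadcastd */
{
  return _mm256_set1_epi32 (w);
}
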
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
deleted file mode 100644
index 1f66602398..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,194 +0,0 @@
-/* memset optimized with AVX512 for KNL hardware.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-#ifndef MEMSET
-# define MEMSET __memset_avx512_no_vzeroupper
-# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
-#endif
-
- .section .text.avx512,"ax",@progbits
-#if defined PIC
-ENTRY (MEMSET_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMSET_CHK)
-#endif
-
-ENTRY (MEMSET)
- vpxor %xmm0, %xmm0, %xmm0
- vmovd %esi, %xmm1
- lea (%rdi, %rdx), %rsi
- mov %rdi, %rax
- vpshufb %xmm0, %xmm1, %xmm0
- cmp $16, %rdx
- jb L(less_16bytes)
- cmp $512, %rdx
- vbroadcastss %xmm0, %zmm2
- ja L(512bytesormore)
- cmp $256, %rdx
- jb L(less_256bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm2, 0xC0(%rdi)
- vmovups %zmm2, -0x100(%rsi)
- vmovups %zmm2, -0xC0(%rsi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_256bytes):
- cmp $128, %dl
- jb L(less_128bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_128bytes):
- cmp $64, %dl
- jb L(less_64bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_64bytes):
- cmp $32, %dl
- jb L(less_32bytes)
- vmovdqu %ymm2, (%rdi)
- vmovdqu %ymm2, -0x20(%rsi)
- ret
-
-L(less_32bytes):
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm0, -0x10(%rsi)
- ret
-
-L(less_16bytes):
- cmp $8, %dl
- jb L(less_8bytes)
- vmovq %xmm0, (%rdi)
- vmovq %xmm0, -0x08(%rsi)
- ret
-
-L(less_8bytes):
- vmovd %xmm0, %ecx
- cmp $4, %dl
- jb L(less_4bytes)
- mov %ecx, (%rdi)
- mov %ecx, -0x04(%rsi)
- ret
-
-L(less_4bytes):
- cmp $2, %dl
- jb L(less_2bytes)
- mov %cx, (%rdi)
- mov %cx, -0x02(%rsi)
- ret
-
-L(less_2bytes):
- cmp $1, %dl
- jb L(less_1bytes)
- mov %cl, (%rdi)
-L(less_1bytes):
- ret
-
-L(512bytesormore):
- mov __x86_shared_cache_size_half(%rip), %rcx
- cmp %rcx, %rdx
- ja L(preloop_large)
- cmp $1024, %rdx
- ja L(1024bytesormore)
-
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm2, 0xC0(%rdi)
- vmovups %zmm2, 0x100(%rdi)
- vmovups %zmm2, 0x140(%rdi)
- vmovups %zmm2, 0x180(%rdi)
- vmovups %zmm2, 0x1C0(%rdi)
- vmovups %zmm2, -0x200(%rsi)
- vmovups %zmm2, -0x1C0(%rsi)
- vmovups %zmm2, -0x180(%rsi)
- vmovups %zmm2, -0x140(%rsi)
- vmovups %zmm2, -0x100(%rsi)
- vmovups %zmm2, -0xC0(%rsi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-/* Align on 64 and loop with aligned stores. */
-L(1024bytesormore):
- sub $0x100, %rsi
- vmovups %zmm2, (%rax)
- and $-0x40, %rdi
- add $0x40, %rdi
-
-L(gobble_256bytes_loop):
- vmovaps %zmm2, (%rdi)
- vmovaps %zmm2, 0x40(%rdi)
- vmovaps %zmm2, 0x80(%rdi)
- vmovaps %zmm2, 0xC0(%rdi)
- add $0x100, %rdi
- cmp %rsi, %rdi
- jb L(gobble_256bytes_loop)
- vmovups %zmm2, (%rsi)
- vmovups %zmm2, 0x40(%rsi)
- vmovups %zmm2, 0x80(%rsi)
- vmovups %zmm2, 0xC0(%rsi)
- ret
-
-/* Align on 128 and loop with non-temporal stores. */
-L(preloop_large):
- and $-0x80, %rdi
- add $0x80, %rdi
- vmovups %zmm2, (%rax)
- vmovups %zmm2, 0x40(%rax)
- sub $0x200, %rsi
-
-L(gobble_512bytes_nt_loop):
- vmovntdq %zmm2, (%rdi)
- vmovntdq %zmm2, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm2, 0xC0(%rdi)
- vmovntdq %zmm2, 0x100(%rdi)
- vmovntdq %zmm2, 0x140(%rdi)
- vmovntdq %zmm2, 0x180(%rdi)
- vmovntdq %zmm2, 0x1C0(%rdi)
- add $0x200, %rdi
- cmp %rsi, %rdi
- jb L(gobble_512bytes_nt_loop)
- sfence
- vmovups %zmm2, (%rsi)
- vmovups %zmm2, 0x40(%rsi)
- vmovups %zmm2, 0x80(%rsi)
- vmovups %zmm2, 0xC0(%rsi)
- vmovups %zmm2, 0x100(%rsi)
- vmovups %zmm2, 0x140(%rsi)
- vmovups %zmm2, 0x180(%rsi)
- vmovups %zmm2, 0x1C0(%rsi)
- ret
-END (MEMSET)
-#endif
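
Before its aligned-store loops, the AVX-512 memset above rounds the destination up to the next 64-byte boundary (128 bytes in the non-temporal path) with an and/add pair, relying on the unaligned store it has already issued to cover the bytes in front of that boundary. The same arithmetic in C, assuming align is a power of two (round_up_to_next is a hypothetical helper):

#include <stdint.h>

static unsigned char *
round_up_to_next (unsigned char *p, uintptr_t align)
{
  uintptr_t u = (uintptr_t) p;
  /* "and $-0x40, %rdi; add $0x40, %rdi" with align == 64:
     clear the low bits, then step to the next boundary.  */
  return (unsigned char *) ((u & ~(align - 1)) + align);
}
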
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
deleted file mode 100644
index 0783979ca5..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ /dev/null
@@ -1,24 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
-
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
-
-# define SECTION(p) p##.avx512
-# define MEMSET_SYMBOL(p,s) p##_avx512_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
deleted file mode 100644
index 2eb9e3744e..0000000000
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ /dev/null
@@ -1,263 +0,0 @@
-/* memset/bzero with unaligned store and rep stosb
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* memset is implemented as:
- 1. Use overlapping store to avoid branch.
- 2. If size is less than VEC, use integer register stores.
- 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
- 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
-
-#include <sysdep.h>
-
-#ifndef MEMSET_CHK_SYMBOL
-# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
-#endif
-
-#ifndef WMEMSET_CHK_SYMBOL
-# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
-#endif
-
-#ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-# define VZEROUPPER_SHORT_RETURN vzeroupper
-# else
-# define VZEROUPPER_SHORT_RETURN rep
-# endif
-#endif
-
-#ifndef MOVQ
-# if VEC_SIZE > 16
-# define MOVQ vmovq
-# else
-# define MOVQ movq
-# endif
-#endif
-
-/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
- up REP STOSB operation, REP STOSB isn't faster on short data. The
- memset micro benchmark in glibc shows that 2KB is the approximate
- value above which REP STOSB becomes faster on processors with
- Enhanced REP STOSB. Since the stored value is fixed, larger register
- size has minimal impact on threshold. */
-#ifndef REP_STOSB_THRESHOLD
-# define REP_STOSB_THRESHOLD 2048
-#endif
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
- .section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
- movq %rdi, %rax /* Set return value. */
- movq %rsi, %rdx /* Set n. */
- pxor %xmm0, %xmm0
- jmp L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
-#endif
-
-#if IS_IN (libc)
-# if defined SHARED
-ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
-# endif
-
-ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
- shlq $2, %rdx
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
-END (WMEMSET_SYMBOL (__wmemset, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
-#endif
-
-ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-L(entry_from_bzero):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMSET_SYMBOL (__memset, unaligned))
-
-# if VEC_SIZE == 16
-/* Only used to measure performance of REP STOSB. */
-ENTRY (__memset_erms)
-# else
-/* Provide a symbol to debugger. */
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
- /* Issue vzeroupper before rep stosb. */
- VZEROUPPER
- movq %rdx, %rcx
- movzbl %sil, %eax
- movq %rdi, %rdx
- rep stosb
- movq %rdx, %rax
- ret
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
-# if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
-
-L(stosb_more_2x_vec):
- cmpq $REP_STOSB_THRESHOLD, %rdx
- ja L(stosb)
-#endif
-L(more_2x_vec):
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_start)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-L(return):
- VZEROUPPER
- ret
-
-L(loop_start):
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- VMOVU %VEC(0), (%rdi)
- andq $-(VEC_SIZE * 4), %rcx
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- cmpq %rdx, %rcx
- je L(return)
-L(loop):
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(0), VEC_SIZE(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- VZEROUPPER_SHORT_RETURN
- ret
-L(less_vec):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
-# endif
-# if VEC_SIZE > 16
- cmpb $16, %dl
- jae L(between_16_31)
-# endif
- MOVQ %xmm0, %rcx
- cmpb $8, %dl
- jae L(between_8_15)
- cmpb $4, %dl
- jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movb %cl, (%rdi)
-1:
- VZEROUPPER
- ret
-# if VEC_SIZE > 32
- /* From 32 to 63. No branch when size == 32. */
-L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
- VZEROUPPER
- ret
-# endif
-# if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
- VZEROUPPER
- ret
-# endif
- /* From 8 to 15. No branch when size == 8. */
-L(between_8_15):
- movq %rcx, -8(%rdi,%rdx)
- movq %rcx, (%rdi)
- VZEROUPPER
- ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl %ecx, -4(%rdi,%rdx)
- movl %ecx, (%rdi)
- VZEROUPPER
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movw %cx, -2(%rdi,%rdx)
- movw %cx, (%rdi)
- VZEROUPPER
- ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
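
The dispatch in the unaligned_erms entry point above amounts to a size classification: tiny sizes use integer stores, anything up to 2 * VEC_SIZE uses one pair of overlapping vector stores, sizes above REP_STOSB_THRESHOLD use rep stosb, and the rest use either four vector stores or the aligned 4 * VEC loop. A small C table of that decision, using the constants of the AVX2 build (VEC_SIZE 32, threshold 2048 per the header comment); classify_memset_size is just an illustration:

#include <stddef.h>

enum { VEC_SIZE = 32, REP_STOSB_THRESHOLD = 2048 };

static const char *
classify_memset_size (size_t n)
{
  if (n < VEC_SIZE)
    return "L(less_vec): integer-register stores";
  if (n <= 2 * VEC_SIZE)
    return "two overlapping VEC stores";
  if (n > REP_STOSB_THRESHOLD)
    return "L(stosb): rep stosb";
  if (n <= 4 * VEC_SIZE)
    return "four VEC stores";
  return "L(loop): align to 4 * VEC and store 4 * VEC per iteration";
}
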
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
deleted file mode 100644
index 11f27378b0..0000000000
--- a/sysdeps/x86_64/multiarch/memset.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Multiple versions of memset
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <shlib-compat.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib. */
-#if IS_IN (libc)
-ENTRY(memset)
- .type memset, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memset_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- lea __memset_sse2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 1f
- lea __memset_sse2_unaligned(%rip), %RAX_LP
-1:
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 2f
- lea __memset_avx2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz L(AVX512F)
- lea __memset_avx2_unaligned(%rip), %RAX_LP
-L(AVX512F):
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 2f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 2f
- lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memset_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memset_avx512_unaligned(%rip), %RAX_LP
-2: ret
-END(memset)
-#endif
-
-#if IS_IN (libc)
-# define MEMSET_SYMBOL(p,s) p##_sse2_##s
-# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memset calls through a PLT.
- The speedup we get from using SSE2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
- .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
- .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
-# endif
-
-# undef weak_alias
-# define weak_alias(original, alias) \
- .weak bzero; bzero = __bzero
-
-# undef strong_alias
-# define strong_alias(original, alias)
-#endif
-
-#include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
deleted file mode 100644
index 7e08311cdf..0000000000
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Multiple versions of memset_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib. */
-#if IS_IN (libc)
-# ifdef SHARED
-ENTRY(__memset_chk)
- .type __memset_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 1f
- lea __memset_chk_sse2_unaligned(%rip), %RAX_LP
-1:
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 2f
- lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz L(AVX512F)
- lea __memset_chk_avx2_unaligned(%rip), %RAX_LP
-L(AVX512F):
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 2f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 2f
- lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memset_chk_avx512_unaligned(%rip), %RAX_LP
-2: ret
-END(__memset_chk)
-
-strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
- .section .gnu.warning.__memset_zero_constant_len_parameter
- .string "memset used with constant zero length parameter; this could be due to transposed parameters"
-# else
-# include "../memset_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
deleted file mode 100644
index 453f183747..0000000000
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Count bits in CPU set. x86-64 multi-arch version.
- This file is part of the GNU C Library.
- Copyright (C) 2008-2017 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sched.h>
-#include "init-arch.h"
-
-#define __sched_cpucount static generic_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-#define POPCNT(l) \
- ({ __cpu_mask r; \
- asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
- r; })
-#define __sched_cpucount static popcount_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-libc_ifunc (__sched_cpucount,
- HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount);
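
The deleted sched_cpucount.c builds the generic bit-count loop twice, once with a POPCNT inline-asm macro, and lets libc_ifunc pick at load time. Outside libc the same effect is usually obtained with the compiler builtin, which becomes the POPCNT instruction when the target allows it; a hedged sketch (toy_cpucount is not a glibc function, and __bits is glibc's internal cpu_set_t member):

#define _GNU_SOURCE
#include <sched.h>
#include <stddef.h>

static int
toy_cpucount (size_t setsize, const cpu_set_t *set)
{
  int count = 0;
  /* setsize is in bytes, as in __sched_cpucount; walk the mask words.  */
  for (size_t i = 0; i < setsize / sizeof (set->__bits[0]); i++)
    count += __builtin_popcountl (set->__bits[i]);
  return count;
}

Application code would normally just use the CPU_COUNT macro from <sched.h> rather than walking the mask by hand.
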
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
deleted file mode 100644
index 34231f8b46..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S
deleted file mode 100644
index ee81ab6ae3..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy.S
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Multiple versions of stpcpy
- All versions must be listed in ifunc-impl-list.c. */
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy
-#include "strcpy.S"
-
-weak_alias (__stpcpy, stpcpy)
-libc_hidden_def (__stpcpy)
-libc_hidden_builtin_def (stpcpy)
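
The stpcpy wrappers above only set USE_AS_STPCPY and reuse the strcpy bodies, because the two functions differ solely in the return value: stpcpy returns a pointer to the terminating NUL it wrote rather than to the start of the destination. Equivalent C for reference (toy_stpcpy is a made-up name):

#include <string.h>

static char *
toy_stpcpy (char *dst, const char *src)
{
  size_t len = strlen (src);
  memcpy (dst, src, len + 1);   /* include the terminating NUL */
  return dst + len;             /* points at that NUL */
}
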
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
deleted file mode 100644
index 2fde77dcab..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STPNCPY __stpncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
- __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
-#endif
-
-#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
deleted file mode 100644
index 658520f78f..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
deleted file mode 100644
index 2698ca6a8c..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of stpncpy
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY __stpncpy
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#include "strcpy.S"
-
-weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.S b/sysdeps/x86_64/multiarch/strcasecmp_l.S
deleted file mode 100644
index 49f5b9fd95..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of strcasecmp and strcasecmp_l
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP __strcasecmp_l
-#define USE_AS_STRCASECMP_L
-#include "strcmp.S"
-
-weak_alias (__strcasecmp_l, strcasecmp_l)
-libc_hidden_def (strcasecmp_l)
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
deleted file mode 100644
index d0a8a1518a..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ /dev/null
@@ -1,279 +0,0 @@
-/* strcat with SSE2
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_sse2_unaligned
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- pmovmskb %xmm0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 80(%rax), %xmm0
- add $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm1
- add $16, %rax
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm2
- add $16, %rax
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm3
- add $16, %rax
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $16, %rax
- .p2align 4
- L(align64_loop):
- movaps (%rax), %xmm4
- pminub 16(%rax), %xmm4
- movaps 32(%rax), %xmm5
- pminub 48(%rax), %xmm5
- add $64, %rax
- pminub %xmm4, %xmm5
- pcmpeqb %xmm0, %xmm5
- pmovmskb %xmm5, %edx
- test %edx, %edx
- jz L(align64_loop)
-
- pcmpeqb -64(%rax), %xmm0
- sub $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $16, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $32, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $48, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit64):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-sse2-unaligned.S"
-#endif
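
The inlined strlen at the top of strcat-sse2-unaligned.S scans 16 bytes per step with pcmpeqb against zero, compresses the byte-compare results into a bit mask with pmovmskb, and uses bsf on the first non-zero mask to locate the NUL. The same idiom with SSE2 intrinsics, stripped of the alignment handling and 64-byte main loop of the real code (so, unlike the assembly, this sketch may read past the string's final page; it is illustration only):

#include <emmintrin.h>
#include <stddef.h>
#include <string.h>

static size_t
toy_strlen16 (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  size_t off = 0;
  for (;;)
    {
      __m128i chunk;
      memcpy (&chunk, s + off, 16);                 /* unaligned 16-byte load */
      unsigned mask
        = (unsigned) _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
      if (mask != 0)
        return off + (size_t) __builtin_ctz (mask); /* bsf: first NUL byte */
      off += 16;
    }
}
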
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index edd683d778..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,867 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
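The removed __strcat_ssse3 above is an inlined vectorized strlen over the destination (byte checks for the first 16 bytes, then PCMPEQB/PMOVMSKB over 16-byte and 64-byte blocks) that falls through into the included strcpy-ssse3.S at L(StartStrcpyPart). A minimal C sketch of that overall shape, using SSE2 intrinsics; the sketch_* helpers are illustrative only and assume dest is writable and src is NUL-terminated:

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* Length scan in the spirit of the inlined strlen above: align down to a
   16-byte block and test whole blocks with PCMPEQB/PMOVMSKB.  */
static size_t
sketch_strlen16 (const char *s)
{
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
  const __m128i zero = _mm_setzero_si128 ();
  unsigned mask = (unsigned) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
  mask &= 0xffffu << (s - p);            /* ignore bytes before S */
  while (mask == 0)
    {
      p += 16;
      mask = (unsigned) _mm_movemask_epi8
        (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
    }
  return (size_t) (p + __builtin_ctz (mask) - s);
}

char *
sketch_strcat (char *dest, const char *src)
{
  /* The assembly falls through into the inlined strcpy part here
     instead of making a call.  */
  memcpy (dest + sketch_strlen16 (dest), src, strlen (src) + 1);
  return dest;
}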
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
deleted file mode 100644
index 0e0e5dda9c..0000000000
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Multiple versions of strcat
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifndef USE_AS_STRNCAT
-# ifndef STRCAT
-# define STRCAT strcat
-# endif
-#endif
-
-#ifdef USE_AS_STRNCAT
-# define STRCAT_SSSE3 __strncat_ssse3
-# define STRCAT_SSE2 __strncat_sse2
-# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned
-# define __GI_STRCAT __GI_strncat
-# define __GI___STRCAT __GI___strncat
-#else
-# define STRCAT_SSSE3 __strcat_ssse3
-# define STRCAT_SSE2 __strcat_sse2
-# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned
-# define __GI_STRCAT __GI_strcat
-# define __GI___STRCAT __GI___strcat
-#endif
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(STRCAT)
- .type STRCAT, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCAT_SSE2_UNALIGNED(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 2f
- leaq STRCAT_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- leaq STRCAT_SSSE3(%rip), %rax
-2: ret
-END(STRCAT)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCAT_SSE2, @function; \
- .align 16; \
- .globl STRCAT_SSE2; \
- .hidden STRCAT_SSE2; \
- STRCAT_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcat calls through a PLT.
- The speedup we get from using SSSE3 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
-#endif
-
-#ifndef USE_AS_STRNCAT
-# include "../strcat.S"
-#endif
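strcat.S above contains no string code of its own; it is the IFUNC selector plus the wrapper that builds __strcat_sse2 from ../strcat.S. The selection it performs is roughly the sketch below; the cpu_has_* probes are stand-ins for glibc's internal HAS_ARCH_FEATURE/HAS_CPU_FEATURE checks, not real interfaces:

/* Hypothetical feature probes; glibc reads these bits from its
   cpu-features data rather than calling functions.  */
extern int cpu_has_fast_unaligned_load (void);
extern int cpu_has_ssse3 (void);

extern char *__strcat_sse2 (char *, const char *);
extern char *__strcat_sse2_unaligned (char *, const char *);
extern char *__strcat_ssse3 (char *, const char *);

static char *(*resolve_strcat (void)) (char *, const char *)
{
  if (cpu_has_fast_unaligned_load ())
    return __strcat_sse2_unaligned;
  if (cpu_has_ssse3 ())
    return __strcat_ssse3;
  return __strcat_sse2;
}

char *sketch_strcat_ifunc (char *, const char *)
  __attribute__ ((ifunc ("resolve_strcat")));

The assembly gets the same effect with the .type strcat, @gnu_indirect_function directive and a tiny resolver body, and then binds libc-internal calls (__GI_strcat) directly to __strcat_sse2 to avoid the PLT, as the comment above explains.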
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
deleted file mode 100644
index cbbd0b33d3..0000000000
--- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,280 +0,0 @@
-/* strchr with SSE2 without bsf
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
- atom_text_section
-ENTRY (__strchr_sse2_no_bsf)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %eax
- pmovmskb %xmm3, %edx
- andl %esi, %eax
- andl %esi, %edx
- test %eax, %eax
- jnz L(matches)
- test %edx, %edx
- jnz L(return_null)
-
-L(loop):
- movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %eax
- pmovmskb %xmm3, %edx
- or %eax, %edx
- jz L(loop)
-
- pmovmskb %xmm3, %edx
- test %eax, %eax
- jnz L(matches)
-
-/* Return NULL. */
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-L(matches):
- /* There is a match. First find where NULL is. */
- leaq -16(%rdi), %rdi
- test %edx, %edx
- jz L(match_case1)
-
- .p2align 4
-L(match_case2):
- test %al, %al
- jz L(match_high_case2)
-
- mov %al, %cl
- and $15, %cl
- jnz L(match_case2_4)
-
- mov %dl, %ch
- and $15, %ch
- jnz L(return_null)
-
- test $0x10, %al
- jnz L(Exit5)
- test $0x10, %dl
- jnz L(return_null)
- test $0x20, %al
- jnz L(Exit6)
- test $0x20, %dl
- jnz L(return_null)
- test $0x40, %al
- jnz L(Exit7)
- test $0x40, %dl
- jnz L(return_null)
- lea 7(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case2_4):
- test $0x01, %al
- jnz L(Exit1)
- test $0x01, %dl
- jnz L(return_null)
- test $0x02, %al
- jnz L(Exit2)
- test $0x02, %dl
- jnz L(return_null)
- test $0x04, %al
- jnz L(Exit3)
- test $0x04, %dl
- jnz L(return_null)
- lea 3(%rdi), %rax
- ret
-
- .p2align 4
-L(match_high_case2):
- test %dl, %dl
- jnz L(return_null)
-
- mov %ah, %cl
- and $15, %cl
- jnz L(match_case2_12)
-
- mov %dh, %ch
- and $15, %ch
- jnz L(return_null)
-
- test $0x10, %ah
- jnz L(Exit13)
- test $0x10, %dh
- jnz L(return_null)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x20, %dh
- jnz L(return_null)
- test $0x40, %ah
- jnz L(Exit15)
- test $0x40, %dh
- jnz L(return_null)
- lea 15(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case2_12):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x01, %dh
- jnz L(return_null)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x02, %dh
- jnz L(return_null)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x04, %dh
- jnz L(return_null)
- lea 11(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case1):
- test %al, %al
- jz L(match_high_case1)
-
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- lea 7(%rdi), %rax
- ret
-
- .p2align 4
-L(match_high_case1):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- lea 15(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit1):
- lea (%rdi), %rax
- ret
-
- .p2align 4
-L(Exit2):
- lea 1(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit3):
- lea 2(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit4):
- lea 3(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit5):
- lea 4(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit6):
- lea 5(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit7):
- lea 6(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit9):
- lea 8(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit10):
- lea 9(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit11):
- lea 10(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit12):
- lea 11(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit13):
- lea 12(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit14):
- lea 13(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit15):
- lea 14(%rdi), %rax
- ret
-
-END (__strchr_sse2_no_bsf)
-#endif
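The removed __strchr_sse2_no_bsf scans 16 aligned bytes per iteration, comparing each block both against the broadcast search character and against zero; a match only counts if it is not preceded by the terminating NUL. A compact C rendering of the steady-state loop with SSE2 intrinsics is sketched below; it assumes the unaligned head has already been handled and uses __builtin_ctz where the assembly deliberately avoids BSF in favor of the byte-test ladder above:

#include <emmintrin.h>

/* Steady-state loop only: P must be 16-byte aligned.  */
static char *
sketch_strchr_loop (const char *p, int c)
{
  const __m128i needle = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();
  for (;; p += 16)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      unsigned match = (unsigned) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, needle));
      unsigned nul = (unsigned) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      if ((match | nul) != 0)
        {
          unsigned first = (unsigned) __builtin_ctz (match | nul);
          /* A NUL before the first match means the character is absent.  */
          return (match >> first) & 1 ? (char *) p + first : 0;
        }
    }
}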
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
deleted file mode 100644
index c9f54ca2e2..0000000000
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Multiple versions of strchr
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
-   The speedup we get from the optimized SSE2 variant is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strchr; __GI_strchr = __strchr_sse2
-#endif
-
-#include "../strchr.S"
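strchr.S above is again only the selector: it defaults to __strchr_sse2 and switches to __strchr_sse2_no_bsf when the Slow_BSF feature bit is set. The practical difference between the two variants is how the 16-bit PMOVMSKB result is turned into a byte index; a sketch of the two decodings follows (the branchy one is a simplified stand-in for the test/jnz ladder in the file above):

/* BSF-based decode, as in __strchr_sse2.  */
static inline int
mask_to_index_bsf (unsigned mask)
{
  return __builtin_ctz (mask);          /* compiles to BSF or TZCNT */
}

/* Branch-based decode in the spirit of __strchr_sse2_no_bsf, for CPUs
   glibc flags as Slow_BSF.  MASK must be a nonzero 16-bit value.  */
static inline int
mask_to_index_no_bsf (unsigned mask)
{
  int i = 0;
  if (!(mask & 0x00ff)) { mask >>= 8; i += 8; }
  if (!(mask & 0x000f)) { mask >>= 4; i += 4; }
  if (!(mask & 0x0003)) { mask >>= 2; i += 2; }
  if (!(mask & 0x0001)) i += 1;
  return i;
}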
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
deleted file mode 100644
index b0992dce39..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ /dev/null
@@ -1,213 +0,0 @@
-/* strcmp with unaligned loads
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-#include "sysdep.h"
-
-ENTRY ( __strcmp_sse2_unaligned)
- movl %edi, %eax
- xorl %edx, %edx
- pxor %xmm7, %xmm7
- orl %esi, %eax
- andl $4095, %eax
- cmpl $4032, %eax
- jg L(cross_page)
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- testq %rax, %rax
- je L(next_48_bytes)
-L(return):
- bsfq %rax, %rdx
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm6
- movdqu 16(%rsi), %xmm3
- movdqu 32(%rdi), %xmm5
- pcmpeqb %xmm6, %xmm3
- movdqu 32(%rsi), %xmm2
- pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
- movdqu 48(%rdi), %xmm4
- pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
- movdqu 48(%rsi), %xmm0
- pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm2, %eax
- salq $16, %rdx
- pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %rax
- orq %rdx, %rax
- pmovmskb %xmm0, %ecx
- movq %rcx, %rdx
- salq $48, %rdx
- orq %rdx, %rax
- jne L(return)
-L(main_loop_header):
- leaq 64(%rdi), %rdx
- movl $4096, %ecx
- pxor %xmm9, %xmm9
- andq $-64, %rdx
- subq %rdi, %rdx
- leaq (%rdi, %rdx), %rax
- addq %rsi, %rdx
- movq %rdx, %rsi
- andl $4095, %esi
- subq %rsi, %rcx
- shrq $6, %rcx
- movq %rcx, %rsi
- jmp L(loop_start)
-
- .p2align 4
-L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
-L(back_to_loop):
- movdqu (%rdx), %xmm0
- movdqu 16(%rdx), %xmm1
- movdqa (%rax), %xmm2
- movdqa 16(%rax), %xmm3
- pcmpeqb %xmm2, %xmm0
- movdqu 32(%rdx), %xmm5
- pcmpeqb %xmm3, %xmm1
- pminub %xmm2, %xmm0
- movdqu 48(%rdx), %xmm6
- pminub %xmm3, %xmm1
- movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
- movdqa 48(%rax), %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm3, %xmm6
- pminub %xmm2, %xmm5
- pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- je L(loop)
- pcmpeqb %xmm7, %xmm5
- movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pcmpeqb %xmm7, %xmm6
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
- salq $32, %r8
- pmovmskb %xmm6, %esi
- orq %r8, %rcx
- orq %rdi, %rcx
- salq $48, %rsi
- orq %rsi, %rcx
- bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(loop_cross_page):
- xor %r10, %r10
- movq %rdx, %r9
- and $63, %r9
- subq %r9, %r10
-
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
- pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
- pcmpeqb %xmm3, %xmm1
- pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
- pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm3, %xmm6
- pminub %xmm2, %xmm5
- pminub %xmm3, %xmm6
-
- pcmpeqb %xmm7, %xmm0
- pcmpeqb %xmm7, %xmm1
- pcmpeqb %xmm7, %xmm5
- pcmpeqb %xmm7, %xmm6
-
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
- salq $32, %r8
- pmovmskb %xmm6, %esi
- orq %r8, %rdi
- orq %rcx, %rdi
- salq $48, %rsi
- orq %rsi, %rdi
- movq %r9, %rcx
- movq $63, %rsi
- shrq %cl, %rdi
- test %rdi, %rdi
- je L(back_to_loop)
- bsfq %rdi, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(cross_page_loop):
- cmpb %cl, %al
- jne L(different)
- addq $1, %rdx
- cmpq $64, %rdx
- je L(main_loop_header)
-L(cross_page):
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
- testb %al, %al
- jne L(cross_page_loop)
- xorl %eax, %eax
-L(different):
- subl %ecx, %eax
- ret
-END (__strcmp_sse2_unaligned)
-
-#endif
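The kernel of the removed __strcmp_sse2_unaligned is the three-instruction test repeated throughout the file: PCMPEQB gives 0xff where the two blocks agree, PMINUB of that with the first string's bytes becomes zero exactly where they disagree or where the first string ends, and a final compare against zero plus PMOVMSKB yields a bitmask of those positions. One 16-byte step of it in C with SSE2 intrinsics:

#include <emmintrin.h>

/* Returns a bit set at every byte position where A and B differ or where
   A has a NUL; zero means "equal so far, keep going".  */
static unsigned
sketch_strcmp_block (const char *a, const char *b)
{
  __m128i va = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  __m128i eq = _mm_cmpeq_epi8 (va, vb);    /* 0xff where a[i] == b[i] */
  __m128i t = _mm_min_epu8 (eq, va);       /* 0 where a[i] != b[i] or a[i] == 0 */
  t = _mm_cmpeq_epi8 (t, _mm_setzero_si128 ());
  return (unsigned) _mm_movemask_epi8 (t);
}

A nonzero result is consumed as in L(return) above: the count of trailing zero bits gives the offset of the first interesting byte, and the two bytes there are subtracted to form the strcmp result.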
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
deleted file mode 100644
index ed26d4a8fb..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ /dev/null
@@ -1,1792 +0,0 @@
-/* strcmp with SSE4.2
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-
-/* We use 0x1a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_NEGATIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to find out if two 16byte data elements are the same
- and the offset of the first different byte. There are 4 cases:
-
- 1. Both 16byte data elements are valid and identical.
-   2. Both 16byte data elements have EOS and are identical.
- 3. Both 16byte data elements are valid and they differ at offset X.
- 4. At least one 16byte data element has EOS at offset X. Two 16byte
- data elements must differ at or before offset X.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2 16 0 1 1
- 3 X 1 0 0
- 4 0 <= X 1 0/1 0/1
-
- We exit from the loop for cases 2, 3 and 4 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
- case 2. */
-
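The four cases tabulated above can be exercised directly with the SSE4.2 intrinsics for PCMPISTRI (compile with -msse4.2). The sketch below is the aligned, equal-offset skeleton only; it deliberately ignores the page-crossing handling that most of this file is devoted to, so unlike the real code it may read past the end of the strings:

#include <nmmintrin.h>
#include <stddef.h>

#define SKETCH_IMM (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH \
                    | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)

static int
sketch_strcmp_sse42 (const char *s1, const char *s2)
{
  for (size_t i = 0; ; i += 16)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) (s1 + i));
      __m128i b = _mm_loadu_si128 ((const __m128i *) (s2 + i));
      int cf = _mm_cmpistrc (a, b, SKETCH_IMM);   /* cases 3 and 4 */
      int zf = _mm_cmpistrz (a, b, SKETCH_IMM);   /* EOS in the second operand */
      if (!cf && !zf)
        continue;                                 /* case 1: all 16 bytes equal */
      if (!cf)
        return 0;                                 /* case 2: equal up to EOS */
      int idx = _mm_cmpistri (a, b, SKETCH_IMM);  /* first difference/EOS */
      return (unsigned char) s1[i + idx] - (unsigned char) s2[i + idx];
    }
}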
- /* Put all SSE 4.2 functions together. */
- .section .text.SECTION,"ax",@progbits
- .align 16
- .type STRCMP_SSE42, @function
- .globl STRCMP_SSE42
- .hidden STRCMP_SSE42
-#ifdef USE_AS_STRCASECMP_L
-ENTRY (GLABEL(__strcasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RDX_LP
-
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
-END (GLABEL(__strcasecmp))
- /* FALLTHROUGH to strcasecmp_l. */
-#endif
-#ifdef USE_AS_STRNCASECMP_L
-ENTRY (GLABEL(__strncasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RCX_LP
-
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
-END (GLABEL(__strncasecmp))
- /* FALLTHROUGH to strncasecmp_l. */
-#endif
-
-
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
-
-STRCMP_SSE42:
- cfi_startproc
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to compare up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-# else
- mov (%rdx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strcasecmp_l_nonascii
-#endif
-#ifdef USE_AS_STRNCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-# else
- mov (%rcx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strncasecmp_l_nonascii
-#endif
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
- je LABEL(strcmp_exitz)
- cmp $1, %rdx
- je LABEL(Byte0)
- mov %rdx, %r11
-#endif
- mov %esi, %ecx
- mov %edi, %eax
-/* Use 64bit AND here to avoid long NOP padding. */
- and $0x3f, %rcx /* rsi alignment in cache line */
- and $0x3f, %rax /* rdi alignment in cache line */
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- .section .rodata.cst16,"aM",@progbits,16
- .align 16
-LABEL(belowupper):
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
- .quad 0x5a5a5a5a5a5a5a5a
- .quad 0x5a5a5a5a5a5a5a5a
-# else
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
- .quad 0x2020202020202020
- .quad 0x2020202020202020
- .previous
- movdqa LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
- movdqa LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
- movdqa LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
-#endif
- cmp $0x30, %ecx
- ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
- cmp $0x30, %eax
- ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-# define TOLOWER(reg1, reg2) \
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
- vpandn %xmm7, %xmm8, %xmm8; \
- vpandn %xmm9, %xmm10, %xmm10; \
- vpand LCQWORD_reg, %xmm8, %xmm8; \
- vpand LCQWORD_reg, %xmm10, %xmm10; \
- vpor reg1, %xmm8, reg1; \
- vpor reg2, %xmm10, reg2
-# else
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm7; \
- movdqa UCHIGH_reg, %xmm8; \
- movdqa reg2, %xmm9; \
- movdqa UCHIGH_reg, %xmm10; \
- pcmpgtb UCLOW_reg, %xmm7; \
- pcmpgtb reg1, %xmm8; \
- pcmpgtb UCLOW_reg, %xmm9; \
- pcmpgtb reg2, %xmm10; \
- pand %xmm8, %xmm7; \
- pand %xmm10, %xmm9; \
- pand LCQWORD_reg, %xmm7; \
- pand LCQWORD_reg, %xmm9; \
- por %xmm7, reg1; \
- por %xmm9, reg2
-# endif
- TOLOWER (%xmm1, %xmm2)
-#else
-# define TOLOWER(reg1, reg2)
-#endif
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
- jnz LABEL(less16bytes)/* If not, find different value or null char */
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)/* finish comparison */
-#endif
- add $16, %rsi /* prepare to search next 16 bytes */
- add $16, %rdi /* prepare to search next 16 bytes */
-
- /*
- * Determine source and destination string offsets from 16-byte
- * alignment. Use relative offset difference between the two to
- * determine which case below to use.
- */
- .p2align 4
-LABEL(crosscache):
- and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
- and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
- mov $0xffff, %edx /* for equivalent offset */
- xor %r8d, %r8d
- and $0xf, %ecx /* offset of rsi */
- and $0xf, %eax /* offset of rdi */
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
- cmp %eax, %ecx
- je LABEL(ashr_0) /* rsi and rdi relative offset same */
- ja LABEL(bigger)
- mov %edx, %r8d /* r8d is offset flag for exit tail */
- xchg %ecx, %eax
- xchg %rsi, %rdi
-LABEL(bigger):
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- lea 15(%rax), %r9
- sub %rcx, %r9
- lea LABEL(unaligned_table)(%rip), %r10
- movslq (%r10, %r9,4), %r9
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- lea (%r10, %r9), %r10
- jmp *%r10 /* jump to corresponding case */
-
-/*
- * The following cases will be handled by ashr_0
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(0~15) n(0~15) 15(15+ n-n) ashr_0
- */
- .p2align 4
-LABEL(ashr_0):
-
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
-#else
- movdqa (%rdi), %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
-#endif
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
- pmovmskb %xmm1, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- /*
-	 * After the shift, edx can only equal r9d if the leftover
-	 * (16 - rcx) bytes of both strings are equal and contain no null char.
- */
- jne LABEL(less32bytes) /* mismatch or null char */
- UPDATE_STRNCMP_COUNTER
- mov $16, %rcx
- mov $16, %r9
-
- /*
- * Now both strings are aligned at 16-byte boundary. Loop over strings
- * checking 32-bytes per iteration.
- */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
- .p2align 4
-LABEL(ashr_0_use):
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- jmp LABEL(ashr_0_use)
-
-
- .p2align 4
-LABEL(ashr_0_exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- lea -16(%rdx, %rcx), %rcx
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rax,4), %eax
- movl (%rcx,%rdx,4), %edx
-#endif
- sub %edx, %eax
- ret
-
-
-
-/*
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n -15 0(15 +(n-15) - n) ashr_1
- */
- .p2align 4
-LABEL(ashr_1):
- pslldq $15, D(%xmm2) /* shift first string to align with second */
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
- pmovmskb %xmm2, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- jnz LABEL(less32bytes) /* mismatch or null char seen */
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads*/
- mov $1, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 1(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_1_use):
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
-LABEL(nibble_ashr_1_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_1_use)
-
- .p2align 4
-LABEL(nibble_ashr_1_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $1, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $14, %ecx
- ja LABEL(nibble_ashr_1_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
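Every ashr_N case below repeats the %r10 bookkeeping introduced here: it starts at ((s1 + N) & 0xfff) - 0x1000, i.e. minus the number of bytes left in the current 4 KiB page, gains 16 per 16-byte block, and the moment it turns positive the code detours into the nibble_ashr_N path to look for a terminator before the shifted load could touch the next page. The starting value, expressed in C (names hypothetical):

#include <stdint.h>

/* Counter as set up before the loop; stays <= 0 while whole 16-byte steps
   remain inside the current 4 KiB page of (s1 + shift).  */
static intptr_t
sketch_page_counter (const char *s1, unsigned shift)
{
  return (intptr_t) (((uintptr_t) s1 + shift) & 0xfff) - 0x1000;
}

Each iteration then adds 16 to the counter and branches to the nibble path when it is positive, matching the "add $16, %r10; jg" pairs throughout the file.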
-/*
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
- */
- .p2align 4
-LABEL(ashr_2):
- pslldq $14, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $2, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 2(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_2_use):
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
-LABEL(nibble_ashr_2_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_2_use)
-
- .p2align 4
-LABEL(nibble_ashr_2_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $2, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $13, %ecx
- ja LABEL(nibble_ashr_2_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
- */
- .p2align 4
-LABEL(ashr_3):
- pslldq $13, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $3, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 3(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
-LABEL(loop_ashr_3_use):
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
-LABEL(nibble_ashr_3_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_3_use)
-
- .p2align 4
-LABEL(nibble_ashr_3_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $3, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $12, %ecx
- ja LABEL(nibble_ashr_3_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
- */
- .p2align 4
-LABEL(ashr_4):
- pslldq $12, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $4, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 4(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_4_use):
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
-LABEL(nibble_ashr_4_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_4_use)
-
- .p2align 4
-LABEL(nibble_ashr_4_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $4, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $11, %ecx
- ja LABEL(nibble_ashr_4_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
- */
- .p2align 4
-LABEL(ashr_5):
- pslldq $11, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $5, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 5(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_5_use):
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
-LABEL(nibble_ashr_5_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_5_use)
-
- .p2align 4
-LABEL(nibble_ashr_5_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $5, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $10, %ecx
- ja LABEL(nibble_ashr_5_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
- */
- .p2align 4
-LABEL(ashr_6):
- pslldq $10, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $6, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 6(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_6_use):
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
-LABEL(nibble_ashr_6_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_6_use)
-
- .p2align 4
-LABEL(nibble_ashr_6_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $6, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $9, %ecx
- ja LABEL(nibble_ashr_6_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
- */
- .p2align 4
-LABEL(ashr_7):
- pslldq $9, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $7, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 7(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_7_use):
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
-LABEL(nibble_ashr_7_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_7_use)
-
- .p2align 4
-LABEL(nibble_ashr_7_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $7, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $8, %ecx
- ja LABEL(nibble_ashr_7_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
- */
- .p2align 4
-LABEL(ashr_8):
- pslldq $8, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $8, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 8(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_8_use):
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
-LABEL(nibble_ashr_8_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_8_use)
-
- .p2align 4
-LABEL(nibble_ashr_8_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $8, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $7, %ecx
- ja LABEL(nibble_ashr_8_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
- */
- .p2align 4
-LABEL(ashr_9):
- pslldq $7, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $9, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 9(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_9_use):
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
-LABEL(nibble_ashr_9_restart_use):
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_9_use)
-
- .p2align 4
-LABEL(nibble_ashr_9_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $9, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $6, %ecx
- ja LABEL(nibble_ashr_9_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
- */
- .p2align 4
-LABEL(ashr_10):
- pslldq $6, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $10, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 10(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_10_use):
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
-LABEL(nibble_ashr_10_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_10_use)
-
- .p2align 4
-LABEL(nibble_ashr_10_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $10, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $5, %ecx
- ja LABEL(nibble_ashr_10_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
- */
- .p2align 4
-LABEL(ashr_11):
- pslldq $5, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $11, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 11(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_11_use):
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
-LABEL(nibble_ashr_11_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_11_use)
-
- .p2align 4
-LABEL(nibble_ashr_11_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $11, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $4, %ecx
- ja LABEL(nibble_ashr_11_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
- */
- .p2align 4
-LABEL(ashr_12):
- pslldq $4, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $12, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 12(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_12_use):
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
-LABEL(nibble_ashr_12_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_12_use)
-
- .p2align 4
-LABEL(nibble_ashr_12_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $12, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $3, %ecx
- ja LABEL(nibble_ashr_12_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
- */
- .p2align 4
-LABEL(ashr_13):
- pslldq $3, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $13, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 13(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_13_use):
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
-LABEL(nibble_ashr_13_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_13_use)
-
- .p2align 4
-LABEL(nibble_ashr_13_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $13, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $2, %ecx
- ja LABEL(nibble_ashr_13_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
- */
- .p2align 4
-LABEL(ashr_14):
- pslldq $2, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $14, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 14(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_14_use):
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
-LABEL(nibble_ashr_14_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_14_use)
-
- .p2align 4
-LABEL(nibble_ashr_14_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $14, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $1, %ecx
- ja LABEL(nibble_ashr_14_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
- */
- .p2align 4
-LABEL(ashr_15):
- pslldq $1, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
-
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $15, %r9d /* byte position left over from less32bytes case */
-	/*
-	 * The %r10 value set up here allows us to detect crossing a page
-	 * boundary.  When %r10 goes positive we have crossed a page
-	 * boundary and need to take the nibble path.
-	 */
- lea 15(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
-
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_15_use):
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
-LABEL(nibble_ashr_15_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_15_use)
-
- .p2align 4
-LABEL(nibble_ashr_15_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $15, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $0, %ecx
- ja LABEL(nibble_ashr_15_restart_use)
-
-LABEL(nibble_ashr_exit_use):
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- .p2align 4
-LABEL(exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add %rcx, %rdx
- lea -16(%rdi, %r9), %rdi
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- test %r8d, %r8d
- jz LABEL(ret_use)
- xchg %eax, %edx
-LABEL(ret_use):
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rdx,4), %edx
- movl (%rcx,%rax,4), %eax
-#endif
-
- sub %edx, %eax
- ret
-
-LABEL(less32bytes):
-	lea	(%rdi, %rax), %rdi	/* locate the exact address of the first operand (rdi) */
-	lea	(%rsi, %rcx), %rsi	/* locate the exact address of the second operand (rsi) */
- test %r8d, %r8d
- jz LABEL(ret)
-	xchg	%rsi, %rdi	/* recover the original order according to the flag (%r8d) */
-
- .p2align 4
-LABEL(ret):
-LABEL(less16bytes):
- bsf %rdx, %rdx /* find and store bit index in %rdx */
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rdx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzbl (%rsi, %rdx), %ecx
- movzbl (%rdi, %rdx), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
-
-LABEL(strcmp_exitz):
- xor %eax, %eax
- ret
-
- .p2align 4
- // XXX Same as code above
-LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
- cfi_endproc
- .size STRCMP_SSE42, .-STRCMP_SSE42
-
-#undef UCLOW_reg
-#undef UCHIGH_reg
-#undef LCQWORD_reg
-#undef TOLOWER
-
- /* Put all SSE 4.2 functions together. */
- .section .rodata.SECTION,"a",@progbits
- .p2align 3
-LABEL(unaligned_table):
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
- .int LABEL(ashr_0) - LABEL(unaligned_table)
-
-#undef LABEL
-#undef GLABEL
-#undef SECTION
-#undef movdqa
-#undef movdqu
-#undef pmovmskb
-#undef pcmpistri
-#undef psubb
-#undef pcmpeqb
-#undef psrldq
-#undef pslldq
-#undef palignr
-#undef pxor
-#undef D
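
For readers tracing the ashr_N cases removed above: each case primes %r10 with
((aligned source + shift) & 0xfff) - 0x1000, and the loops then add 16 per
iteration, so %r10 turns positive when the next aligned 16-byte load of the
shifted source would run into the following 4 KiB page and the nibble_ashr
path must first check for a terminating NUL.  A minimal C sketch of that
bookkeeping follows; the helper names are invented for illustration and are
not glibc interfaces.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 0x1000
#define VEC_SIZE  16

/* Prime the counter the way the deleted code primes %r10: it stays negative
   while the shifted source is still at least one 16-byte step away from the
   end of its 4 KiB page.  */
static intptr_t
init_page_counter (const void *shifted_src)
{
  return (intptr_t) ((uintptr_t) shifted_src & (PAGE_SIZE - 1)) - PAGE_SIZE;
}

/* Advance by one 16-byte step; a positive result means the next aligned
   load would touch the following page, so a NUL check is needed before
   reading any further.  */
static bool
step_crosses_page (intptr_t *counter)
{
  *counter += VEC_SIZE;
  return *counter > 0;
}
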
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
deleted file mode 100644
index 54f8f7dd44..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Multiple versions of strcmp
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifdef USE_AS_STRNCMP
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-
-# define STRCMP_SSE42 __strncmp_sse42
-# define STRCMP_SSSE3 __strncmp_ssse3
-# define STRCMP_SSE2 __strncmp_sse2
-# define __GI_STRCMP __GI_strncmp
-#elif defined USE_AS_STRCASECMP_L
-# include "locale-defines.h"
-
-# define UPDATE_STRNCMP_COUNTER
-
-# define STRCMP_AVX __strcasecmp_l_avx
-# define STRCMP_SSE42 __strcasecmp_l_sse42
-# define STRCMP_SSSE3 __strcasecmp_l_ssse3
-# define STRCMP_SSE2 __strcasecmp_l_sse2
-# define __GI_STRCMP __GI___strcasecmp_l
-#elif defined USE_AS_STRNCASECMP_L
-# include "locale-defines.h"
-
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-
-# define STRCMP_AVX __strncasecmp_l_avx
-# define STRCMP_SSE42 __strncasecmp_l_sse42
-# define STRCMP_SSSE3 __strncasecmp_l_ssse3
-# define STRCMP_SSE2 __strncasecmp_l_sse2
-# define __GI_STRCMP __GI___strncasecmp_l
-#else
-# define USE_AS_STRCMP
-# define UPDATE_STRNCMP_COUNTER
-# ifndef STRCMP
-# define STRCMP strcmp
-# define STRCMP_SSE42 __strcmp_sse42
-# define STRCMP_SSSE3 __strcmp_ssse3
-# define STRCMP_SSE2 __strcmp_sse2
-# define __GI_STRCMP __GI_strcmp
-# endif
-#endif
-
-/* Define multiple versions only for the definition in libc.  Don't
-   define multiple versions for strncmp in the static library, since
-   we need strncmp before initialization has happened. */
-#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
- .text
-ENTRY(STRCMP)
- .type STRCMP, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef USE_AS_STRCMP
- leaq __strcmp_sse2_unaligned(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 3f
-#else
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq STRCMP_SSE42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-#endif
-2: leaq STRCMP_SSSE3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq STRCMP_SSE2(%rip), %rax
-3: ret
-END(STRCMP)
-
-# ifdef USE_AS_STRCASECMP_L
-ENTRY(__strcasecmp)
- .type __strcasecmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strcasecmp_avx(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Usable)
- jnz 3f
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq __strcasecmp_sse42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-2: leaq __strcasecmp_ssse3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq __strcasecmp_sse2(%rip), %rax
-3: ret
-END(__strcasecmp)
-weak_alias (__strcasecmp, strcasecmp)
-# endif
-# ifdef USE_AS_STRNCASECMP_L
-ENTRY(__strncasecmp)
- .type __strncasecmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strncasecmp_avx(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Usable)
- jnz 3f
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq __strncasecmp_sse42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-2: leaq __strncasecmp_ssse3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq __strncasecmp_sse2(%rip), %rax
-3: ret
-END(__strncasecmp)
-weak_alias (__strncasecmp, strncasecmp)
-# endif
-
-# undef LABEL
-# define LABEL(l) .L##l##_sse42
-# define GLABEL(l) l##_sse42
-# define SECTION sse4.2
-# include "strcmp-sse42.S"
-
-
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define LABEL(l) .L##l##_avx
-# define GLABEL(l) l##_avx
-# define USE_AVX 1
-# undef STRCMP_SSE42
-# define STRCMP_SSE42 STRCMP_AVX
-# define SECTION avx
-# include "strcmp-sse42.S"
-# endif
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCMP_SSE2, @function; \
- .align 16; \
- .globl STRCMP_SSE2; \
- .hidden STRCMP_SSE2; \
- STRCMP_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
-
-# ifdef USE_AS_STRCASECMP_L
-# define ENTRY2(name) \
- .type __strcasecmp_sse2, @function; \
- .align 16; \
- .globl __strcasecmp_sse2; \
- .hidden __strcasecmp_sse2; \
- __strcasecmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# define END2(name) \
- cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
-# endif
-
-# ifdef USE_AS_STRNCASECMP_L
-# define ENTRY2(name) \
- .type __strncasecmp_sse2, @function; \
- .align 16; \
- .globl __strncasecmp_sse2; \
- .hidden __strncasecmp_sse2; \
- __strncasecmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# define END2(name) \
- cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
-# endif
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
-   The speedup we get from using SSE4.2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
-#endif
-
-#include "../strcmp.S"
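
The ENTRY(STRCMP) resolver removed above is a GNU indirect function: the
dynamic linker calls it once and it returns the address of the best variant
for the running CPU.  For plain strcmp the order is __strcmp_sse2_unaligned
when Fast_Unaligned_Load is set, then the SSSE3 variant, then the baseline
SSE2 one; the strncmp and strcasecmp resolvers instead prefer SSE4.2 (and,
for the case-insensitive entry points, an AVX variant) when those features
are usable.  A rough C equivalent of the plain-strcmp selection is sketched
below; the cpu_has_* helpers stand in for the HAS_ARCH_FEATURE and
HAS_CPU_FEATURE checks and are not real glibc interfaces.

typedef int (*strcmp_fn) (const char *, const char *);

extern int __strcmp_sse2 (const char *, const char *);
extern int __strcmp_ssse3 (const char *, const char *);
extern int __strcmp_sse2_unaligned (const char *, const char *);

/* Placeholder feature tests; the real resolver reads the cpu_features data
   set up by init-arch.  */
extern int cpu_has_fast_unaligned_load (void);
extern int cpu_has_ssse3 (void);

/* Selection order of the deleted strcmp ifunc resolver, sketched in C.  */
static strcmp_fn
select_strcmp (void)
{
  if (cpu_has_fast_unaligned_load ())
    return __strcmp_sse2_unaligned;
  if (cpu_has_ssse3 ())
    return __strcmp_ssse3;
  return __strcmp_sse2;
}
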
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
deleted file mode 100644
index 6a5ab7ab26..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,1889 +0,0 @@
-/* strcpy with SSE2 and unaligned load
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_sse2_unaligned
-# endif
-
-# endif
-
-# define JMPTBL(I, B) I - B
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), %rcx; \
- lea (%r11, %rcx), %rcx; \
- jmp *%rcx
-
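
The JMPTBL/BRANCH_TO_JMPTBL_ENTRY macros above implement a position-independent
jump table: each entry stores a 32-bit offset of an exit label relative to the
table itself, so the dispatch loads the offset, adds the table address, and
jumps.  A hypothetical C rendering of the same idea is sketched below; the cast
from a data pointer to a function pointer mirrors what the assembly effectively
does and is not portable ISO C.

#include <stddef.h>
#include <stdint.h>

typedef void (*exit_handler) (void);

/* table[index] holds the offset of the handler relative to the table base,
   just as the .int JMPTBL(L(ExitN), L(ExitTable)) entries do further down.  */
static void
branch_to_jmptbl_entry (const int32_t *table, size_t index)
{
  exit_handler h = (exit_handler) ((const char *) table + table[index]);
  h ();
}
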
-# ifndef USE_AS_STRCAT
-
-.text
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
- test %r8, %r8
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
-
-# endif
-
- and $63, %rcx
- cmp $32, %rcx
- jbe L(SourceStringAlignmentLess32)
-
- and $-16, %rsi
- and $15, %rcx
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
-
- pcmpeqb (%rsi), %xmm1
- pmovmskb %xmm1, %rdx
- shr %cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $16, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $17, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyFrom1To16BytesTailCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail)
-
- pcmpeqb 16(%rsi), %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
- add $16, %r10
- cmp %r10, %r8
- jbe L(CopyFrom1To32BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes)
-
- movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
- movdqu %xmm1, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(Unalign16Both):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $16, %rcx
- movdqa (%rsi, %rcx), %xmm1
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $48, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm4
- movdqu %xmm3, (%rdi, %rcx)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm1
- movdqu %xmm4, (%rdi, %rcx)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm1)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movdqu %xmm3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea 16(%rsi, %rcx), %rsi
- and $-0x40, %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea 128(%r8, %rdx), %r8
-# endif
-L(Unaligned64Loop):
- movaps (%rsi), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rsi), %xmm5
- movaps 32(%rsi), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rsi), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(Unaligned64Leave)
-
-L(Unaligned64Loop_start):
- add $64, %rdi
- add $64, %rsi
- movdqu %xmm4, -64(%rdi)
- movaps (%rsi), %xmm2
- movdqa %xmm2, %xmm4
- movdqu %xmm5, -48(%rdi)
- movaps 16(%rsi), %xmm5
- pminub %xmm5, %xmm2
- movaps 32(%rsi), %xmm3
- movdqu %xmm6, -32(%rdi)
- movaps %xmm3, %xmm6
- movdqu %xmm7, -16(%rdi)
- movaps 48(%rsi), %xmm7
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jz L(Unaligned64Loop_start)
-
-L(Unaligned64Leave):
- pxor %xmm1, %xmm1
-
- pcmpeqb %xmm4, %xmm0
- pcmpeqb %xmm5, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_0)
- test %rcx, %rcx
- jnz L(CopyFrom1To16BytesUnaligned_16)
-
- pcmpeqb %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_32)
-
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
- movdqu %xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 48(%rdi, %rdx), %rax
-# endif
- movdqu %xmm7, 48(%rdi)
- add $15, %r8
- sub %rdx, %r8
- lea 49(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $48, %rsi
- add $48, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLess32):
- pxor %xmm0, %xmm0
- movdqu (%rsi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $16, %r8
-# else
- cmp $17, %r8
-# endif
- jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1)
-
- pcmpeqb %xmm2, %xmm0
- movdqu %xmm1, (%rdi)
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $32, %r8
-# else
- cmp $33, %r8
-# endif
- jbe L(CopyFrom1To32Bytes1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes1)
-
- and $-16, %rsi
- and $15, %rcx
- jmp L(Unalign16Both)
-
-/*------End of main part with loops---------------------*/
-
-/* Case1 */
-
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
- .p2align 4
-L(CopyFrom1To16BytesTail):
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1):
- add $16, %rsi
- add $16, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
-# endif
-L(CopyFrom1To16BytesTail1):
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes):
- bsf %rdx, %rdx
- add %rcx, %rsi
- add $16, %rdx
- sub %rcx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_0):
- bsf %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- movdqu %xmm4, (%rdi)
- add $63, %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_16):
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 16(%rdi, %rdx), %rax
-# endif
- movdqu %xmm5, 16(%rdi)
- add $47, %r8
- sub %rdx, %r8
- lea 17(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $16, %rsi
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_32):
- bsf %rdx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 32(%rdi, %rdx), %rax
-# endif
- movdqu %xmm6, 32(%rdi)
- add $31, %r8
- sub %rdx, %r8
- lea 33(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $32, %rsi
- add $32, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm6):
- movdqu %xmm6, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm5):
- movdqu %xmm5, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm4):
- movdqu %xmm4, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm3):
- movdqu %xmm3, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm1):
- movdqu %xmm1, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesExit):
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
-/* Case2 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- add $16, %rdx
- sub %rcx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTailCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTail1Case2):
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesCase2)
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To32BytesCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTailCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1Case2OrCase3):
- add $16, %rdi
- add $16, %rsi
- sub $16, %r8
-L(CopyFrom1To16BytesTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1Case2)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-# endif
-
-/*-----------End of labels for copying 1-16 bytes and 1-32 bytes--------------*/
-
- .p2align 4
-L(Exit1):
- mov %dh, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov %dh, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- mov (%rsi), %ecx
- mov %dh, 4(%rdi)
- mov %ecx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $5, %r8
- lea 5(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $6, %r8
- lea 6(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $7, %r8
- lea 7(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $8, %r8
- lea 8(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rsi), %rcx
- mov %dh, 8(%rdi)
- mov %rcx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $9, %r8
- lea 9(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $10, %r8
- lea 10(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $11, %r8
- lea 11(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $12, %r8
- lea 12(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $13, %r8
- lea 13(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $14, %r8
- lea 14(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $15, %r8
- lea 15(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
- lea 16(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit17):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
- mov %dh, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $17, %r8
- lea 17(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $18, %r8
- lea 18(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $19, %r8
- lea 19(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $20, %r8
- lea 20(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dh, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $21, %r8
- lea 21(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $22, %r8
- lea 22(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $23, %r8
- lea 23(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $24, %r8
- lea 24(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
- mov %dh, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $25, %r8
- lea 25(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $26, %r8
- lea 26(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $27, %r8
- lea 27(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $28, %r8
- lea 28(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $29, %r8
- lea 29(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $30, %r8
- lea 30(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $31, %r8
- lea 31(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $32, %r8
- lea 32(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, (%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit1):
- mov (%rsi), %dl
- mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 1(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 2(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit3):
- mov (%rsi), %cx
- mov 2(%rsi), %dl
- mov %cx, (%rdi)
- mov %dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 3(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 4(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit5):
- mov (%rsi), %ecx
- mov 4(%rsi), %dl
- mov %ecx, (%rdi)
- mov %dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 5(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 6(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 7(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 8(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit9):
- mov (%rsi), %rcx
- mov 8(%rsi), %dl
- mov %rcx, (%rdi)
- mov %dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 9(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 10(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 11(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 12(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 13(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 14(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 15(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 16(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit17):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 17(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 18(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 19(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 20(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- mov 20(%rsi), %dl
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 21(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 22(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 23(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 24(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 25(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 26(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 27(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 28(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 29(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 30(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 31(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 32(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 32(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit33):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- mov 32(%rsi), %cl
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
- mov %cl, 32(%rdi)
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 33(%rdi)
-# endif
- ret
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- ret
-
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(Fill3):
- mov %edx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill4):
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(Fill5):
- mov %edx, (%rdi)
- mov %dl, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill6):
- mov %edx, (%rdi)
- mov %dx, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill7):
- mov %rdx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rdi)
- mov %dl, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rdi)
- mov %dx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rdi)
- mov %edx, 7(%rdi)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rdi)
- mov %edx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rdi)
- mov %rdx, 5(%rdi)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rdi)
- mov %rdx, 6(%rdi)
- ret
-
- .p2align 4
-L(Fill15):
- movdqu %xmm0, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill16):
- movdqu %xmm0, (%rdi)
- ret
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm2):
- movdqu %xmm2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyFrom1To16BytesXmmExit):
- bsf %rdx, %rdx
- add $15, %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- pxor %xmm0, %xmm0
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit)
-
- movdqu %xmm0, (%rdi)
- add $16, %rdi
-
- mov %rdi, %rsi
- and $0xf, %rsi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- movdqa %xmm0, 32(%rdi)
- movdqa %xmm0, 48(%rdi)
- add $64, %rdi
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- add $32, %rdi
- sub $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillExit):
- add $16, %r8
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-/* end of ifndef USE_AS_STRCAT */
-# endif
-
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(Unaligned64LeaveCase2)
-L(Unaligned64LeaveCase3):
- lea 64(%r8), %rcx
- and $-16, %rcx
- add $48, %r8
- jl L(CopyFrom1To16BytesCase3)
- movdqu %xmm4, (%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm5, 16(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm6, 32(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
- lea 64(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 64(%rdi)
-# endif
- ret
-
- .p2align 4
-L(Unaligned64LeaveCase2):
- xor %rcx, %rcx
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm4, (%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm5)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm5, 16(%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm6)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm6, 32(%rdi)
- lea 16(%rdi, %rcx), %rdi
- lea 16(%rsi, %rcx), %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
- .p2align 4
- .section .rodata
-L(ExitTable):
- .int JMPTBL(L(Exit1), L(ExitTable))
- .int JMPTBL(L(Exit2), L(ExitTable))
- .int JMPTBL(L(Exit3), L(ExitTable))
- .int JMPTBL(L(Exit4), L(ExitTable))
- .int JMPTBL(L(Exit5), L(ExitTable))
- .int JMPTBL(L(Exit6), L(ExitTable))
- .int JMPTBL(L(Exit7), L(ExitTable))
- .int JMPTBL(L(Exit8), L(ExitTable))
- .int JMPTBL(L(Exit9), L(ExitTable))
- .int JMPTBL(L(Exit10), L(ExitTable))
- .int JMPTBL(L(Exit11), L(ExitTable))
- .int JMPTBL(L(Exit12), L(ExitTable))
- .int JMPTBL(L(Exit13), L(ExitTable))
- .int JMPTBL(L(Exit14), L(ExitTable))
- .int JMPTBL(L(Exit15), L(ExitTable))
- .int JMPTBL(L(Exit16), L(ExitTable))
- .int JMPTBL(L(Exit17), L(ExitTable))
- .int JMPTBL(L(Exit18), L(ExitTable))
- .int JMPTBL(L(Exit19), L(ExitTable))
- .int JMPTBL(L(Exit20), L(ExitTable))
- .int JMPTBL(L(Exit21), L(ExitTable))
- .int JMPTBL(L(Exit22), L(ExitTable))
- .int JMPTBL(L(Exit23), L(ExitTable))
- .int JMPTBL(L(Exit24), L(ExitTable))
- .int JMPTBL(L(Exit25), L(ExitTable))
- .int JMPTBL(L(Exit26), L(ExitTable))
- .int JMPTBL(L(Exit27), L(ExitTable))
- .int JMPTBL(L(Exit28), L(ExitTable))
- .int JMPTBL(L(Exit29), L(ExitTable))
- .int JMPTBL(L(Exit30), L(ExitTable))
- .int JMPTBL(L(Exit31), L(ExitTable))
- .int JMPTBL(L(Exit32), L(ExitTable))
-# ifdef USE_AS_STRNCPY
-L(ExitStrncpyTable):
- .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(FillTable):
- .int JMPTBL(L(Fill0), L(FillTable))
- .int JMPTBL(L(Fill1), L(FillTable))
- .int JMPTBL(L(Fill2), L(FillTable))
- .int JMPTBL(L(Fill3), L(FillTable))
- .int JMPTBL(L(Fill4), L(FillTable))
- .int JMPTBL(L(Fill5), L(FillTable))
- .int JMPTBL(L(Fill6), L(FillTable))
- .int JMPTBL(L(Fill7), L(FillTable))
- .int JMPTBL(L(Fill8), L(FillTable))
- .int JMPTBL(L(Fill9), L(FillTable))
- .int JMPTBL(L(Fill10), L(FillTable))
- .int JMPTBL(L(Fill11), L(FillTable))
- .int JMPTBL(L(Fill12), L(FillTable))
- .int JMPTBL(L(Fill13), L(FillTable))
- .int JMPTBL(L(Fill14), L(FillTable))
- .int JMPTBL(L(Fill15), L(FillTable))
- .int JMPTBL(L(Fill16), L(FillTable))
-# endif
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index 47aaeae671..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3551 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %r8, %r8
- jz L(Exit0)
- cmp $8, %r8
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
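The L(Fill1)..L(Fill16) and L(StrncpyFillTailWithZero1) blocks above exist because strncpy must write NUL into every destination byte past the end of a short source string. A minimal portable sketch of that contract (toy_strncpy is an illustrative name, not the removed glibc code):

#include <stddef.h>

/* Reference semantics only: copy at most n bytes, then zero-fill the
   rest of the buffer, as ISO C requires of strncpy.  The removed
   SSSE3 code does the same with 1..16-byte and 64-byte SSE stores.  */
static char *
toy_strncpy (char *dst, const char *src, size_t n)
{
  size_t i = 0;

  /* Copy source bytes, stopping at the terminator or at n bytes.  */
  for (; i < n && src[i] != '\0'; i++)
    dst[i] = src[i];

  /* Pad the remainder with NUL bytes (the L(Fill*) paths above).  */
  for (; i < n; i++)
    dst[i] = '\0';

  return dst;
}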
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
deleted file mode 100644
index 77819ddc50..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Multiple versions of strcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
-# ifndef STRCPY
-# define STRCPY strcpy
-# endif
-#endif
-
-#ifdef USE_AS_STPCPY
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
-# define __GI_STRCPY __GI_stpncpy
-# define __GI___STRCPY __GI___stpncpy
-# else
-# define STRCPY_SSSE3 __stpcpy_ssse3
-# define STRCPY_SSE2 __stpcpy_sse2
-# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned
-# define __GI_STRCPY __GI_stpcpy
-# define __GI___STRCPY __GI___stpcpy
-# endif
-#else
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
-# define __GI_STRCPY __GI_strncpy
-# else
-# define STRCPY_SSSE3 __strcpy_ssse3
-# define STRCPY_SSE2 __strcpy_sse2
-# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned
-# define __GI_STRCPY __GI_strcpy
-# endif
-#endif
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(STRCPY)
- .type STRCPY, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 2f
- leaq STRCPY_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- leaq STRCPY_SSSE3(%rip), %rax
-2: ret
-END(STRCPY)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCPY_SSE2, @function; \
- .align 16; \
- .globl STRCPY_SSE2; \
- .hidden STRCPY_SSE2; \
- STRCPY_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
- The speedup we get from using SSSE3 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
-#endif
-
-#ifndef USE_AS_STRNCPY
-#include "../strcpy.S"
-#endif
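The selector deleted above is the hand-written form of an IFUNC resolver: the strcpy symbol is typed @gnu_indirect_function and its body returns the address of the variant chosen from the CPU features. Expressed with GCC's ifunc attribute, the same dispatch looks roughly like the sketch below; the my_* names and the two feature predicates are placeholders standing in for glibc's HAS_ARCH_FEATURE / HAS_CPU_FEATURE checks, not real glibc symbols.

/* Sketch of the strcpy dispatch as a C-level IFUNC.  The resolver runs
   when the symbol is first bound and returns the implementation the
   dynamic linker should use from then on.  */

extern char *my_strcpy_sse2 (char *, const char *);
extern char *my_strcpy_sse2_unaligned (char *, const char *);
extern char *my_strcpy_ssse3 (char *, const char *);

/* Placeholder feature tests; glibc reads these bits from its
   cpu_features data.  */
extern int cpu_has_fast_unaligned_load (void);
extern int cpu_has_ssse3 (void);

static void *
my_strcpy_resolver (void)
{
  if (cpu_has_fast_unaligned_load ())
    return my_strcpy_sse2_unaligned;
  if (cpu_has_ssse3 ())
    return my_strcpy_ssse3;
  return my_strcpy_sse2;
}

char *my_strcpy (char *, const char *)
  __attribute__ ((ifunc ("my_strcpy_resolver")));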
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
deleted file mode 100644
index 67991b5ca7..0000000000
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/* strcspn with SSE4.2 intrinsics
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
-
-/* We use 0x2:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ANY
- | _SIDD_POSITIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- A A A A A A A A A A A A A A A A
-
- to find out if the first 16byte data element has any byte A and
- the offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte A at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte A.
- 3. The first 16byte data element is valid and doesn't have the byte A.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
-
- 1 X 1 0/1 0
- 2 16 0 1 0
- 3 16 0 0 0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
-#ifndef STRCSPN_SSE2
-# define STRCSPN_SSE2 __strcspn_sse2
-# define STRCSPN_SSE42 __strcspn_sse42
-#endif
-
-#ifdef USE_AS_STRPBRK
-# define RETURN(val1, val2) return val1
-#else
-# define RETURN(val1, val2) return val2
-#endif
-
-extern
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
-#endif
-STRCSPN_SSE2 (const char *, const char *);
-
-
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
-#endif
-__attribute__ ((section (".text.sse4.2")))
-STRCSPN_SSE42 (const char *s, const char *a)
-{
- if (*a == 0)
- RETURN (NULL, strlen (s));
-
- const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
- if (offset != 0)
- {
- /* Load masks. */
- aligned = (const char *) ((size_t) a & -16L);
- __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return STRCSPN_SSE2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
- }
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return STRCSPN_SSE2 (s, a);
- }
- }
-
- offset = (int) ((size_t) s & 15);
- if (offset != 0)
- {
- /* Check partial string. */
- aligned = (const char *) ((size_t) s & -16L);
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
-
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x2);
- /* No need to check ZFlag since ZFlag is always 1. */
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- if (cflag)
- RETURN ((char *) (s + length), length);
- /* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
- RETURN (NULL, index);
- aligned += 16;
- }
- else
- aligned = s;
-
- while (1)
- {
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x2);
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- int zflag = _mm_cmpistrz (mask, value, 0x2);
- if (cflag)
- RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
- if (zflag)
- RETURN (NULL,
- /* Find where the NULL terminator is. */
- (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
- aligned += 16;
- }
-}
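The comment at the top of this file is the whole algorithm: one PCMPISTRI in equal-any mode classifies a 16-byte chunk into the three cases listed there. A self-contained sketch of that single step, ignoring the page-crossing and >16-byte-set handling of the removed code (scan_chunk is an illustrative name; compile with -msse4.2):

#include <nmmintrin.h>

/* One step of an SSE4.2 strcspn: 'set' holds the reject set (at most
   16 bytes), 'chunk16' points at a 16-byte aligned chunk of the
   string.  Returns the offset of the first reject byte (case 1), the
   offset of the terminator if the string ends here without a match
   (case 2), or 16 if the caller should keep scanning (case 3).  */
static inline int
scan_chunk (__m128i set, const char *chunk16)
{
  __m128i value = _mm_load_si128 ((const __m128i *) chunk16);

  /* 0x02 = _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
            | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT.  */
  int index = _mm_cmpistri (set, value, 0x02);
  int cflag = _mm_cmpistrc (set, value, 0x02);  /* reject byte found */
  int zflag = _mm_cmpistrz (set, value, 0x02);  /* chunk has a NUL   */

  if (cflag)
    return index;                               /* case 1 */
  if (zflag)                                    /* case 2 */
    return _mm_cmpistri (value, value, 0x3a);   /* terminator offset */
  return 16;                                    /* case 3 */
}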
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
deleted file mode 100644
index d102c7e80b..0000000000
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Multiple versions of strcspn
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifdef USE_AS_STRPBRK
-#define STRCSPN_SSE42 __strpbrk_sse42
-#define STRCSPN_SSE2 __strpbrk_sse2
-#define __GI_STRCSPN __GI_strpbrk
-#else
-#ifndef STRCSPN
-#define STRCSPN strcspn
-#define STRCSPN_SSE42 __strcspn_sse42
-#define STRCSPN_SSE2 __strcspn_sse2
-#define __GI_STRCSPN __GI_strcspn
-#endif
-#endif
-
-/* Define multiple versions only for the definition in libc.  Don't
- define multiple versions for strpbrk in the static library since we
- need strpbrk before initialization has happened.  */
-#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
- .text
-ENTRY(STRCSPN)
- .type STRCSPN, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCSPN_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jz 2f
- leaq STRCSPN_SSE42(%rip), %rax
-2: ret
-END(STRCSPN)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCSPN_SSE2, @function; \
- .globl STRCSPN_SSE2; \
- .align 16; \
- STRCSPN_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
-#endif
-
-#ifdef USE_AS_STRPBRK
-#include "../strpbrk.S"
-#else
-#include "../strcspn.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l.S b/sysdeps/x86_64/multiarch/strncase_l.S
deleted file mode 100644
index 9c0149788e..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of strncasecmp and strncasecmp_l
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP __strncasecmp_l
-#define USE_AS_STRNCASECMP_L
-#include "strcmp.S"
-
-weak_alias (__strncasecmp_l, strncasecmp_l)
-libc_hidden_def (strncasecmp_l)
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
deleted file mode 100644
index a3cdbff689..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STRNCAT __strncat_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
- __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
-#endif
-
-#include "string/strncat.c"
diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
deleted file mode 100644
index 133e1d20b0..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_sse2_unaligned
-#include "strcat-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
deleted file mode 100644
index 5c1bf41453..0000000000
--- a/sysdeps/x86_64/multiarch/strncat.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncat
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCAT strncat
-#define USE_AS_STRNCAT
-#include "strcat.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index 96380a46be..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifdef SHARED
-# define USE_SSSE3 1
-# define STRCMP __strncmp_ssse3
-# define USE_AS_STRNCMP
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncmp.S b/sysdeps/x86_64/multiarch/strncmp.S
deleted file mode 100644
index fd5eb1397c..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncmp
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP strncmp
-#define USE_AS_STRNCMP
-#include "strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
deleted file mode 100644
index 296c32cb5d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STRNCPY __strncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
-#endif
-
-#include "strncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
deleted file mode 100644
index fcc23a754a..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
deleted file mode 100644
index 6d87a0ba35..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncpy
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY strncpy
-#define USE_AS_STRNCPY
-#include "strcpy.S"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
deleted file mode 100644
index bbf5c49d89..0000000000
--- a/sysdeps/x86_64/multiarch/strpbrk-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Don't define multiple versions for strpbrk in the static library since we
- need strpbrk before initialization has happened.  */
-#ifdef SHARED
-# define USE_AS_STRPBRK
-# define STRCSPN_SSE2 __strpbrk_sse2
-# define STRCSPN_SSE42 __strpbrk_sse42
-# include "strcspn-c.c"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S
deleted file mode 100644
index 7201d6376f..0000000000
--- a/sysdeps/x86_64/multiarch/strpbrk.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strpbrk
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCSPN strpbrk
-#define USE_AS_STRPBRK
-#include "strcspn.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
deleted file mode 100644
index 1704606b80..0000000000
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/* strspn with SSE4.2 intrinsics
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
-
-/* We use 0x12:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ANY
- | _SIDD_NEGATIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- A A A A A A A A A A A A A A A A
-
- to find out if the first 16byte data element has any non-A byte and
- the offset of the first byte. There are 2 cases:
-
- 1. The first 16byte data element has the non-A byte, including
- EOS, at the offset X.
- 2. The first 16byte data element is valid and doesn't have the non-A
- byte.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 0 0
-
- We exit from the loop for case 1. */
-
-extern size_t __strspn_sse2 (const char *, const char *);
-
-
-size_t
-__attribute__ ((section (".text.sse4.2")))
-__strspn_sse42 (const char *s, const char *a)
-{
- if (*a == 0)
- return 0;
-
- const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
- if (offset != 0)
- {
- /* Load masks. */
- aligned = (const char *) ((size_t) a & -16L);
- __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
- }
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
- }
-
- offset = (int) ((size_t) s & 15);
- if (offset != 0)
- {
- /* Check partial string. */
- aligned = (const char *) ((size_t) s & -16L);
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
-
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
- /* No need to check CFlag since it is always 1. */
- if (length < 16 - offset)
- return length;
- /* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
- return length;
- aligned += 16;
- }
- else
- aligned = s;
-
- while (1)
- {
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
- if (cflag)
- return (size_t) (aligned + index - s);
- aligned += 16;
- }
-}
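strspn differs from the strcspn kernel only in the polarity bit: with imm8 0x12 the same equal-any comparison reports the first byte that is not in the accept set, and a NUL byte is itself "not in the set", so the returned index is directly the span length within the chunk. A one-step sketch under the same assumptions as before (span_chunk is illustrative; compile with -msse4.2):

#include <nmmintrin.h>

/* One step of an SSE4.2 strspn: returns how many leading bytes of the
   16-byte aligned chunk are in the accept set 'set'; 16 means the
   whole chunk matched and scanning should continue.  */
static inline int
span_chunk (__m128i set, const char *chunk16)
{
  __m128i value = _mm_load_si128 ((const __m128i *) chunk16);

  /* 0x12 = _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
            | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT.  */
  return _mm_cmpistri (set, value, 0x12);
}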
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
deleted file mode 100644
index adf7d9e533..0000000000
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Multiple versions of strspn
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(strspn)
- .type strspn, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strspn_sse2(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jz 2f
- leaq __strspn_sse42(%rip), %rax
-2: ret
-END(strspn)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strspn_sse2, @function; \
- .globl __strspn_sse2; \
- .align 16; \
- __strspn_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
-#endif
-
-#include "../strspn.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
deleted file mode 100644
index 138979d10a..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
+++ /dev/null
@@ -1,374 +0,0 @@
-/* strstr with unaligned loads
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-ENTRY(__strstr_sse2_unaligned)
- movzbl (%rsi), %eax
- testb %al, %al
- je L(empty)
- movzbl 1(%rsi), %edx
- testb %dl, %dl
- je L(strchr)
- movd %eax, %xmm1
- movd %edx, %xmm2
- movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4031, %rax
- punpcklbw %xmm2, %xmm2
- punpcklwd %xmm1, %xmm1
- punpcklwd %xmm2, %xmm2
- pshufd $0, %xmm1, %xmm1
- pshufd $0, %xmm2, %xmm2
- ja L(cross_page)
- movdqu (%rdi), %xmm3
- pxor %xmm5, %xmm5
- movdqu 1(%rdi), %xmm4
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- movdqu 16(%rdi), %xmm0
- pcmpeqb %xmm5, %xmm6
- pminub %xmm4, %xmm3
- movdqa %xmm3, %xmm4
- movdqu 17(%rdi), %xmm3
- pcmpeqb %xmm0, %xmm5
- pcmpeqb %xmm2, %xmm3
- por %xmm6, %xmm4
- pcmpeqb %xmm1, %xmm0
- pminub %xmm3, %xmm0
- por %xmm5, %xmm0
- pmovmskb %xmm4, %r8d
- pmovmskb %xmm0, %eax
- salq $16, %rax
- orq %rax, %r8
- je L(next_32_bytes)
-L(next_pair_index):
- bsf %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero1)
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found1)
- cmpb 2(%rax), %dl
- jne L(next_pair)
- xorl %edx, %edx
- jmp L(pair_loop_start)
-
- .p2align 4
-L(strchr):
- movzbl %al, %esi
- jmp __strchr_sse2
-
- .p2align 4
-L(pair_loop):
- addq $1, %rdx
- cmpb 2(%rax,%rdx), %cl
- jne L(next_pair)
-L(pair_loop_start):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop)
-L(found1):
- ret
-L(zero1):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(next_pair):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index)
-
- .p2align 4
-L(next_32_bytes):
- movdqu 32(%rdi), %xmm3
- pxor %xmm5, %xmm5
- movdqu 33(%rdi), %xmm4
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- movdqu 48(%rdi), %xmm0
- pcmpeqb %xmm5, %xmm6
- pminub %xmm4, %xmm3
- movdqa %xmm3, %xmm4
- movdqu 49(%rdi), %xmm3
- pcmpeqb %xmm0, %xmm5
- pcmpeqb %xmm2, %xmm3
- por %xmm6, %xmm4
- pcmpeqb %xmm1, %xmm0
- pminub %xmm3, %xmm0
- por %xmm5, %xmm0
- pmovmskb %xmm4, %eax
- salq $32, %rax
- pmovmskb %xmm0, %r8d
- salq $48, %r8
- orq %rax, %r8
- je L(loop_header)
-L(next_pair2_index):
- bsfq %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero2)
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found2)
- cmpb 2(%rax), %dl
- jne L(next_pair2)
- xorl %edx, %edx
- jmp L(pair_loop2_start)
-
- .p2align 4
-L(pair_loop2):
- addq $1, %rdx
- cmpb 2(%rax,%rdx), %cl
- jne L(next_pair2)
-L(pair_loop2_start):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop2)
-L(found2):
- ret
-L(zero2):
- xorl %eax, %eax
- ret
-L(empty):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(next_pair2):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair2_index)
-L(loop_header):
- movq $-512, %r11
- movq %rdi, %r9
-
- pxor %xmm7, %xmm7
- andq $-64, %rdi
-
- .p2align 4
-L(loop):
- movdqa 64(%rdi), %xmm3
- movdqu 63(%rdi), %xmm6
- movdqa %xmm3, %xmm0
- pxor %xmm2, %xmm3
- pxor %xmm1, %xmm6
- movdqa 80(%rdi), %xmm10
- por %xmm3, %xmm6
- pminub %xmm10, %xmm0
- movdqu 79(%rdi), %xmm3
- pxor %xmm2, %xmm10
- pxor %xmm1, %xmm3
- movdqa 96(%rdi), %xmm9
- por %xmm10, %xmm3
- pminub %xmm9, %xmm0
- pxor %xmm2, %xmm9
- movdqa 112(%rdi), %xmm8
- addq $64, %rdi
- pminub %xmm6, %xmm3
- movdqu 31(%rdi), %xmm4
- pminub %xmm8, %xmm0
- pxor %xmm2, %xmm8
- pxor %xmm1, %xmm4
- por %xmm9, %xmm4
- pminub %xmm4, %xmm3
- movdqu 47(%rdi), %xmm5
- pxor %xmm1, %xmm5
- por %xmm8, %xmm5
- pminub %xmm5, %xmm3
- pminub %xmm3, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- testl %eax, %eax
- je L(loop)
- pminub (%rdi), %xmm6
- pminub 32(%rdi),%xmm4
- pminub 48(%rdi),%xmm5
- pcmpeqb %xmm7, %xmm6
- pcmpeqb %xmm7, %xmm5
- pmovmskb %xmm6, %edx
- movdqa 16(%rdi), %xmm8
- pcmpeqb %xmm7, %xmm4
- movdqu 15(%rdi), %xmm0
- pmovmskb %xmm5, %r8d
- movdqa %xmm8, %xmm3
- pmovmskb %xmm4, %ecx
- pcmpeqb %xmm1,%xmm0
- pcmpeqb %xmm2,%xmm3
- salq $32, %rcx
- pcmpeqb %xmm7,%xmm8
- salq $48, %r8
- pminub %xmm0,%xmm3
- orq %rcx, %rdx
- por %xmm3,%xmm8
- orq %rdx, %r8
- pmovmskb %xmm8, %eax
- salq $16, %rax
- orq %rax, %r8
- je L(loop)
-L(next_pair_index3):
- bsfq %r8, %rcx
- addq %rdi, %rcx
- cmpb $0, (%rcx)
- je L(zero)
- xorl %eax, %eax
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(success3)
- cmpb 1(%rcx), %dl
- jne L(next_pair3)
- jmp L(pair_loop_start3)
-
- .p2align 4
-L(pair_loop3):
- addq $1, %rax
- cmpb 1(%rcx,%rax), %dl
- jne L(next_pair3)
-L(pair_loop_start3):
- movzbl 3(%rsi,%rax), %edx
- testb %dl, %dl
- jne L(pair_loop3)
-L(success3):
- lea -1(%rcx), %rax
- ret
-
- .p2align 4
-L(next_pair3):
- addq %rax, %r11
- movq %rdi, %rax
- subq %r9, %rax
- cmpq %r11, %rax
- jl L(switch_strstr)
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index3)
- jmp L(loop)
-
- .p2align 4
-L(switch_strstr):
- movq %rdi, %rdi
- jmp __strstr_sse2
-
- .p2align 4
-L(cross_page):
-
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqa (%rax), %xmm3
- movdqu -1(%rax), %xmm4
- movdqa %xmm3, %xmm8
- movdqa 16(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm8
- pcmpeqb %xmm2, %xmm3
- movdqa %xmm5, %xmm7
- pminub %xmm4, %xmm3
- movdqu 15(%rax), %xmm4
- pcmpeqb %xmm0, %xmm7
- por %xmm3, %xmm8
- movdqa %xmm5, %xmm3
- movdqa 32(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm3
- movdqa %xmm5, %xmm6
- pmovmskb %xmm8, %ecx
- pminub %xmm4, %xmm3
- movdqu 31(%rax), %xmm4
- por %xmm3, %xmm7
- movdqa %xmm5, %xmm3
- pcmpeqb %xmm0, %xmm6
- movdqa 48(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pmovmskb %xmm7, %r8d
- pcmpeqb %xmm2, %xmm3
- pcmpeqb %xmm5, %xmm0
- pminub %xmm4, %xmm3
- movdqu 47(%rax), %xmm4
- por %xmm3, %xmm6
- movdqa %xmm5, %xmm3
- salq $16, %r8
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm6, %r10d
- pminub %xmm4, %xmm3
- por %xmm3, %xmm0
- salq $32, %r10
- orq %r10, %r8
- orq %rcx, %r8
- movl %edi, %ecx
- pmovmskb %xmm0, %edx
- subl %eax, %ecx
- salq $48, %rdx
- orq %rdx, %r8
- shrq %cl, %r8
- je L(loop_header)
-L(next_pair_index4):
- bsfq %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero)
-
- cmpq %rax,%rdi
- je L(next_pair4)
-
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found3)
- cmpb 1(%rax), %dl
- jne L(next_pair4)
- xorl %edx, %edx
- jmp L(pair_loop_start4)
-
- .p2align 4
-L(pair_loop4):
- addq $1, %rdx
- cmpb 1(%rax,%rdx), %cl
- jne L(next_pair4)
-L(pair_loop_start4):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop4)
-L(found3):
- subq $1, %rax
- ret
-
- .p2align 4
-L(next_pair4):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index4)
- jmp L(loop_header)
-
- .p2align 4
-L(found):
- rep
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
-
-END(__strstr_sse2_unaligned)
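The loop above never compares whole needles in the hot path; it broadcasts the first two needle bytes into xmm1/xmm2, finds positions where both occur together, and only then verifies the remaining needle bytes one at a time (the L(pair_loop*) blocks). A scalar sketch of that two-byte-anchor strategy, leaving out the vectorized candidate search and the fallback to __strstr_sse2 (toy_strstr is an illustrative name, not the glibc code):

#include <stddef.h>
#include <string.h>

/* Find 'needle' in 'hay' by filtering on the first two needle bytes
   and verifying the rest only at candidate positions.  */
static const char *
toy_strstr (const char *hay, const char *needle)
{
  if (needle[0] == '\0')
    return hay;                       /* empty needle: L(empty) above  */
  if (needle[1] == '\0')
    return strchr (hay, needle[0]);   /* one-byte needle: L(strchr)    */

  size_t rest = strlen (needle) - 2;
  for (; (hay = strchr (hay, needle[0])) != NULL; hay++)
    if (hay[1] == needle[1]
        && strncmp (hay + 2, needle + 2, rest) == 0)
      return hay;                     /* anchor pair and tail match    */
  return NULL;
}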
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
deleted file mode 100644
index a7d181d797..0000000000
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Multiple versions of strstr.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Redefine strstr so that the compiler won't complain about the type
- mismatch with the IFUNC selector in strong_alias, below. */
-#undef strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- ? __strstr_sse2_unaligned
- : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
deleted file mode 100644
index 597d64e1e8..0000000000
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Test CPU feature data.
- This file is part of the GNU C Library.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <cpu-features.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-static char *cpu_flags;
-
-/* Search for flags in /proc/cpuinfo and store line
- in cpu_flags. */
-void
-get_cpuinfo (void)
-{
- FILE *f;
- char *line = NULL;
- size_t len = 0;
- ssize_t read;
-
- f = fopen ("/proc/cpuinfo", "r");
- if (f == NULL)
- {
- printf ("cannot open /proc/cpuinfo\n");
- exit (1);
- }
-
- while ((read = getline (&line, &len, f)) != -1)
- {
- if (strncmp (line, "flags", 5) == 0)
- {
- cpu_flags = strdup (line);
- break;
- }
- }
- fclose (f);
- free (line);
-}
-
-int
-check_proc (const char *proc_name, int flag, const char *name)
-{
- int found = 0;
-
- printf ("Checking %s:\n", name);
- printf (" init-arch %d\n", flag);
- if (strstr (cpu_flags, proc_name) != NULL)
- found = 1;
- printf (" cpuinfo (%s) %d\n", proc_name, found);
-
- if (found != flag)
- printf (" *** failure ***\n");
-
- return (found != flag);
-}
-
-static int
-do_test (int argc, char **argv)
-{
- int fails;
-
- get_cpuinfo ();
- fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable),
- "HAS_ARCH_FEATURE (AVX_Usable)");
- fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable),
- "HAS_ARCH_FEATURE (FMA4_Usable)");
- fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2),
- "HAS_CPU_FEATURE (SSE4_2)");
- fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1),
- "HAS_CPU_FEATURE (SSE4_1)");
- fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3),
- "HAS_CPU_FEATURE (SSSE3)");
- fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT),
- "HAS_CPU_FEATURE (POPCOUNT)");
-
- printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails);
-
- return (fails != 0);
-}
-
-#include "../../../test-skeleton.c"
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
deleted file mode 100644
index 1c3e34845d..0000000000
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include "varshift.h"
-
-const int8_t ___m128i_shift_right[31] attribute_hidden =
- {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- };
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
deleted file mode 100644
index 07bb76c4bf..0000000000
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdint.h>
-#include <tmmintrin.h>
-
-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
-
-static __inline__ __m128i
-__m128i_shift_right (__m128i value, unsigned long int offset)
-{
- return _mm_shuffle_epi8 (value,
- _mm_loadu_si128 ((__m128i *) (___m128i_shift_right
- + offset)));
-}
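The table in varshift.c plus this inline amount to a variable-count byte shift: PSHUFB with a 16-byte window taken at 'offset' into the 0..15 / -1 table moves byte 'offset' of the source to lane 0, and control bytes with the high bit set (the -1 entries) produce zero lanes. The removed strcspn/strspn code uses it to read a set or string head whose address is not 16-byte aligned without touching the next aligned block. A usage sketch (the local table and load_unaligned_head are illustrative; compile with -mssse3):

#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* Local copy of the shift table for this sketch; glibc exports it as
   ___m128i_shift_right.  Indices 0..15 select bytes, -1 zeroes them.  */
static const int8_t shift_right_tbl[31] =
  {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };

/* Shift 'value' right by 'offset' bytes, filling with zeros; the same
   operation as the removed __m128i_shift_right.  */
static __m128i
shift_right (__m128i value, unsigned int offset)
{
  return _mm_shuffle_epi8 (value,
                           _mm_loadu_si128 ((const __m128i *)
                                            (shift_right_tbl + offset)));
}

/* Slide the bytes starting at 'p' down to lane 0 by loading only the
   aligned block that contains 'p'; lanes past the end of that block
   read as 0, and the next aligned block is never touched.  */
static __m128i
load_unaligned_head (const char *p)
{
  unsigned int offset = (unsigned int) ((uintptr_t) p & 15);
  const char *aligned = (const char *) ((uintptr_t) p & ~(uintptr_t) 15);
  return shift_right (_mm_load_si128 ((const __m128i *) aligned), offset);
}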
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
deleted file mode 100644
index a51a83a9be..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy-c.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define wcscpy __wcscpy_sse2
-#endif
-
-#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
deleted file mode 100644
index 53857ce4f5..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ /dev/null
@@ -1,552 +0,0 @@
-/* wcscpy with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-# include <sysdep.h>
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (__wcscpy_ssse3)
-
- mov %rsi, %rcx
- mov %rdi, %rdx
-
- cmpl $0, (%rcx)
- jz L(Exit4)
- cmpl $0, 4(%rcx)
- jz L(Exit8)
- cmpl $0, 8(%rcx)
- jz L(Exit12)
- cmpl $0, 12(%rcx)
- jz L(Exit16)
-
- lea 16(%rcx), %rsi
- and $-16, %rsi
-
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
-
- pcmpeqd (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $4, %rax
- je L(Shl4)
- cmp $8, %rax
- je L(Shl8)
- jmp L(Shl12)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqd %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
- pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqd %xmm5, %xmm0
-
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqd %xmm6, %xmm0
-
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqd %xmm7, %xmm0
-
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov $-0x40, %rsi
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -4(%rcx), %xmm1
-
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -8(%rcx), %xmm1
-
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -12(%rcx), %xmm1
-
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit4)
-
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit12)
-
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
- mov %rdi, %rax
- ret
-
-END(__wcscpy_ssse3)
-#endif
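For reference, the SSSE3 routine removed above scans for the terminating wide NUL with pcmpeqd/pmovmskb, copies 64-byte blocks in L(Aligned64Loop) once source and destination share 16-byte alignment (the "rcx_offset == rdx_offset" case), and otherwise shifts misaligned loads into place with palignr via the Shl4/Shl8/Shl12 paths. What it computes is plain wcscpy; a minimal C sketch of those semantics (reference only, not glibc code, assuming the 32-bit wchar_t of x86-64):

    #include <wchar.h>

    /* Reference-only sketch of what __wcscpy_ssse3 computes: copy wide
       characters up to and including the terminating L'\0' and return the
       original destination pointer (the %rdi value saved on entry).  */
    wchar_t *
    wcscpy_reference (wchar_t *dest, const wchar_t *src)
    {
      wchar_t *ret = dest;
      while ((*dest++ = *src++) != L'\0')
        ;
      return ret;
    }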
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
deleted file mode 100644
index 9150ab6d18..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Multiple versions of wcscpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-
- .text
-ENTRY(wcscpy)
- .type wcscpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __wcscpy_sse2(%rip), %rax
- ret
-
-2: leaq __wcscpy_ssse3(%rip), %rax
- ret
-
-END(wcscpy)
-#endif
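The wcscpy.S file removed here is the older, hand-written IFUNC style: the resolver is assembly that tests HAS_CPU_FEATURE (SSSE3) and returns either __wcscpy_ssse3 or __wcscpy_sse2. The same dispatch expressed as a self-contained C sketch, using GCC's ifunc attribute and __builtin_cpu_supports instead of glibc's cpu_features block, so this illustrates the selection logic rather than the glibc mechanism:

    #include <stddef.h>   /* for wchar_t */

    /* Per-CPU entry points named in the diff above.  */
    extern wchar_t *__wcscpy_sse2 (wchar_t *, const wchar_t *);
    extern wchar_t *__wcscpy_ssse3 (wchar_t *, const wchar_t *);

    /* Resolver: prefer the SSSE3 variant when the CPU reports SSSE3,
       otherwise fall back to the SSE2 baseline.  */
    static void *
    wcscpy_resolver (void)
    {
      return __builtin_cpu_supports ("ssse3")
             ? (void *) __wcscpy_ssse3
             : (void *) __wcscpy_sse2;
    }

    wchar_t *wcscpy (wchar_t *, const wchar_t *)
      __attribute__ ((ifunc ("wcscpy_resolver")));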
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
deleted file mode 100644
index e1ec7cfbb5..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WCSNLEN __wcsnlen_sse2
-
-extern __typeof (wcsnlen) __wcsnlen_sse2;
-#endif
-
-#include "wcsmbs/wcsnlen.c"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
deleted file mode 100644
index a8cab0cb00..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#define AS_WCSLEN
-#define AS_STRNLEN
-#define strlen __wcsnlen_sse4_1
-
-#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
deleted file mode 100644
index 304f62eec3..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Multiple versions of wcsnlen.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-# define __wcsnlen __redirect_wcsnlen
-# include <wchar.h>
-# undef __wcsnlen
-
-# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
-
-libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
-weak_alias (__wcsnlen, wcsnlen);
-#endif
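The deleted wcsnlen.c follows glibc's C-level IFUNC convention: __wcsnlen is temporarily #defined to __redirect_wcsnlen so <wchar.h> supplies the prototype under the redirect name, __typeof (REDIRECT_NAME) then types the per-CPU entry points, and IFUNC_SELECTOR picks one at load time. Given SYMBOL_NAME wcsnlen, OPTIMIZE (sse2) and OPTIMIZE (sse4_1) evidently name __wcsnlen_sse2 and __wcsnlen_sse4_1, the symbols defined by wcsnlen-c.c and wcsnlen-sse4_1.S above; libc_ifunc_redirected binds __wcsnlen to the selector's choice and weak_alias exports it as wcsnlen.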
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
deleted file mode 100644
index bfa1a16a35..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define MEMCMP __wmemcmp_avx2_movbe
-#define USE_AS_WMEMCMP 1
-
-#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
deleted file mode 100644
index 46b6715e18..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
deleted file mode 100644
index b07973a4f6..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_sse4_1
-
-#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
deleted file mode 100644
index 94b25a214c..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Multiple versions of wmemcmp
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(wmemcmp)
- .type wmemcmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 1f
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 1f
- HAS_CPU_FEATURE (MOVBE)
- jz 1f
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
- leaq __wmemcmp_avx2_movbe(%rip), %rax
- ret
-
-1: HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __wmemcmp_sse2(%rip), %rax
- ret
-
-2: HAS_CPU_FEATURE (SSE4_1)
- jz 3f
- leaq __wmemcmp_sse4_1(%rip), %rax
- ret
-
-3: leaq __wmemcmp_ssse3(%rip), %rax
- ret
-
-END(wmemcmp)
-#endif
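The wmemcmp.S resolver removed above encodes a four-way choice. Restated as a C sketch in the style of the wcsnlen.c selector earlier in this diff (the feature names are the ones the assembly tests; the OPTIMIZE names are the implementations defined by the wmemcmp-*.S and wmemcmp-c.c wrappers above; SYMBOL_NAME/OPTIMIZE boilerplate as in wcsnlen.c is assumed):

    /* Sketch only -- mirrors the branch order of the deleted assembly.  */
    static inline void *
    wmemcmp_ifunc_selector (void)
    {
      const struct cpu_features *cpu_features = __get_cpu_features ();

      /* AVX2 path: only when VZEROUPPER is not discouraged and the CPU has
         AVX2, MOVBE and fast unaligned AVX loads.  */
      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
          && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
          && CPU_FEATURES_CPU_P (cpu_features, MOVBE)
          && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
        return OPTIMIZE (avx2_movbe);

      /* No SSSE3: use the SSE2 baseline.  */
      if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3))
        return OPTIMIZE (sse2);

      /* SSE4.1 preferred over plain SSSE3 when available.  */
      if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
        return OPTIMIZE (sse4_1);

      return OPTIMIZE (ssse3);
    }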
diff --git a/sysdeps/x86_64/multiarch/wmemset.c b/sysdeps/x86_64/multiarch/wmemset.c
deleted file mode 100644
index dd35be6e49..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Multiple versions of wmemset.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-# define wmemset __redirect_wmemset
-# define __wmemset __redirect___wmemset
-# include <wchar.h>
-# undef wmemset
-# undef __wmemset
-
-# define SYMBOL_NAME wmemset
-# include "ifunc-wmemset.h"
-
-libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ());
-weak_alias (__wmemset, wmemset)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
deleted file mode 100644
index 0a537fe272..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Non-shared version of wmemset_chk for x86-64.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc) && !defined SHARED
-# include "../wmemset_chk.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.c b/sysdeps/x86_64/multiarch/wmemset_chk.c
deleted file mode 100644
index d3ded5595b..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset_chk.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of wmemset_chk.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc.so. */
-#if IS_IN (libc) && defined SHARED
-# define __wmemset_chk __redirect_wmemset_chk
-# include <wchar.h>
-# undef __wmemset_chk
-
-# define SYMBOL_NAME wmemset_chk
-# include "ifunc-wmemset.h"
-
-libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk,
- IFUNC_SELECTOR ());
-#endif