author     Zack Weinberg <zackw@panix.com>    2017-06-08 15:39:03 -0400
committer  Zack Weinberg <zackw@panix.com>    2017-06-08 15:39:03 -0400
commit     5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree       4470480d904b65cf14ca524f96f79eca818c3eaf /sysdeps/x86_64/multiarch
parent     199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization.  [branch: zack/build-layout-experiment]
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile  |  42
-rw-r--r--  sysdeps/x86_64/multiarch/bcopy.S  |  7
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  460
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-wmemset.h  |  42
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S  |  425
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S  |  1776
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-ssse3.S  |  1990
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp.S  |  78
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  |  3180
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3.S  |  3150
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy.S  |  75
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy_chk.S  |  72
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S  |  12
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S  |  420
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S  |  12
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3-back.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S  |  553
-rw-r--r--  sysdeps/x86_64/multiarch/memmove.S  |  101
-rw-r--r--  sysdeps/x86_64/multiarch/memmove_chk.S  |  71
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy.S  |  73
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy_chk.S  |  72
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S  |  22
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S  |  194
-rw-r--r--  sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S  |  24
-rw-r--r--  sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S  |  263
-rw-r--r--  sysdeps/x86_64/multiarch/memset.S  |  82
-rw-r--r--  sysdeps/x86_64/multiarch/memset_chk.S  |  61
-rw-r--r--  sysdeps/x86_64/multiarch/sched_cpucount.c  |  36
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-ssse3.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy.S  |  9
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-c.c  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-ssse3.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy.S  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S  |  6
-rw-r--r--  sysdeps/x86_64/multiarch/strcasecmp_l.S  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S  |  279
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-ssse3.S  |  867
-rw-r--r--  sysdeps/x86_64/multiarch/strcat.S  |  85
-rw-r--r--  sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S  |  280
-rw-r--r--  sysdeps/x86_64/multiarch/strchr.S  |  57
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S  |  213
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse42.S  |  1792
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-ssse3.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp.S  |  209
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S  |  1889
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-ssse3.S  |  3551
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.S  |  99
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn-c.c  |  173
-rw-r--r--  sysdeps/x86_64/multiarch/strcspn.S  |  69
-rw-r--r--  sysdeps/x86_64/multiarch/strncase_l-ssse3.S  |  6
-rw-r--r--  sysdeps/x86_64/multiarch/strncase_l.S  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-c.c  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/strncat-ssse3.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/strncat.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/strncmp-ssse3.S  |  6
-rw-r--r--  sysdeps/x86_64/multiarch/strncmp.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-c.c  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-ssse3.S  |  3
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk-c.c  |  8
-rw-r--r--  sysdeps/x86_64/multiarch/strpbrk.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/strspn-c.c  |  145
-rw-r--r--  sysdeps/x86_64/multiarch/strspn.S  |  50
-rw-r--r--  sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S  |  374
-rw-r--r--  sysdeps/x86_64/multiarch/strstr.c  |  50
-rw-r--r--  sysdeps/x86_64/multiarch/test-multiarch.c  |  96
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.c  |  25
-rw-r--r--  sysdeps/x86_64/multiarch/varshift.h  |  30
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-c.c  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-ssse3.S  |  552
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy.S  |  40
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-c.c  |  9
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S  |  5
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen.c  |  45
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-c.c  |  9
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-sse4.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp-ssse3.S  |  4
-rw-r--r--  sysdeps/x86_64/multiarch/wmemcmp.S  |  55
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset.c  |  33
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S  |  21
-rw-r--r--  sysdeps/x86_64/multiarch/wmemset_chk.c  |  31
87 files changed, 0 insertions(+), 24585 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
deleted file mode 100644
index 310a3a4b72..0000000000
--- a/sysdeps/x86_64/multiarch/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-ifeq ($(subdir),csu)
-tests += test-multiarch
-endif
-
-ifeq ($(subdir),string)
-
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
- strcmp-sse2-unaligned strncmp-ssse3 \
- memcmp-avx2-movbe \
- memcmp-sse4 memcpy-ssse3 \
- memmove-ssse3 \
- memcpy-ssse3-back \
- memmove-ssse3-back \
- memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
- strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
- strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
- strcspn-c strpbrk-c strspn-c varshift \
- memset-avx512-no-vzeroupper \
- memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms \
- memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms
-CFLAGS-varshift.c += -msse4
-CFLAGS-strcspn-c.c += -msse4
-CFLAGS-strpbrk-c.c += -msse4
-CFLAGS-strspn-c.c += -msse4
-endif
-
-ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
- wmemcmp-avx2-movbe \
- wcscpy-ssse3 wcscpy-c \
- wcsnlen-sse4_1 wcsnlen-c
-endif
-
-ifeq ($(subdir),debug)
-sysdep_routines += wmemset_chk-nonshared
-endif
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
deleted file mode 100644
index 639f02bde3..0000000000
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <sysdep.h>
-
- .text
-ENTRY(bcopy)
- xchg %rdi, %rsi
- jmp __libc_memmove /* Branch to IFUNC memmove. */
-END(bcopy)
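
[Editor's note: the deleted bcopy.S above is only an argument-swapping trampoline. bcopy takes (src, dst, n) while memmove takes (dst, src, n), so the stub exchanges %rdi and %rsi and tail-calls the internal __libc_memmove IFUNC. A minimal C sketch of the same idea follows; my_bcopy is a hypothetical name, and the real stub jumps to glibc's internal entry point rather than calling the public memmove.]

    #include <string.h>

    /* Sketch only: bcopy's argument order is the reverse of memmove's,
       so the wrapper just swaps the pointers -- the xchg %rdi, %rsi in
       the deleted assembly -- and forwards the call.  */
    void
    my_bcopy (const void *src, void *dst, size_t n)
    {
      memmove (dst, src, n);
    }
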
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
deleted file mode 100644
index 5627183aca..0000000000
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/* Enumerate available IFUNC implementations of a function. x86-64 version.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <assert.h>
-#include <string.h>
-#include <wchar.h>
-#include <ifunc-impl-list.h>
-#include <sysdep.h>
-#include "init-arch.h"
-
-/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 5
-
-/* Fill ARRAY of MAX elements with IFUNC implementations for function
- NAME supported on target machine and return the number of valid
- entries. */
-
-size_t
-__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- size_t max)
-{
- assert (max >= MAX_IFUNC);
-
- size_t i = 0;
-
- /* Support sysdeps/x86_64/multiarch/memcmp.S. */
- IFUNC_IMPL (i, name, memcmp,
- IFUNC_IMPL_ADD (array, i, memcmp,
- (HAS_ARCH_FEATURE (AVX2_Usable)
- && HAS_CPU_FEATURE (MOVBE)),
- __memcmp_avx2_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
- __memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
- __memcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
- IFUNC_IMPL (i, name, __memmove_chk,
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memmove_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
- __memmove_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memmove.S. */
- IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memmove_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
- __memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
- __memmove_ssse3)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
- __memmove_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memmove, 1,
- __memmove_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memset_chk.S. */
- IFUNC_IMPL (i, name, __memset_chk,
- IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
- __memset_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
- __memset_chk_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_chk_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_chk_avx2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_chk_avx512_no_vzeroupper)
- )
-
- /* Support sysdeps/x86_64/multiarch/memset.S. */
- IFUNC_IMPL (i, name, memset,
- IFUNC_IMPL_ADD (array, i, memset, 1,
- __memset_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memset, 1,
- __memset_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __memset_avx2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memset_avx512_no_vzeroupper)
- )
-
- /* Support sysdeps/x86_64/multiarch/stpncpy.S. */
- IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
- __stpncpy_ssse3)
- IFUNC_IMPL_ADD (array, i, stpncpy, 1,
- __stpncpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/stpcpy.S. */
- IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
- __stpcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
- IFUNC_IMPL (i, name, strcasecmp,
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strcasecmp_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_CPU_FEATURE (SSE4_2),
- __strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- HAS_CPU_FEATURE (SSSE3),
- __strcasecmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
- IFUNC_IMPL (i, name, strcasecmp_l,
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strcasecmp_l_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_CPU_FEATURE (SSE4_2),
- __strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- HAS_CPU_FEATURE (SSSE3),
- __strcasecmp_l_ssse3)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
- __strcasecmp_l_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcat.S. */
- IFUNC_IMPL (i, name, strcat,
- IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
- __strcat_ssse3)
- IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strchr.S. */
- IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcmp.S. */
- IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
- __strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
- __strcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcpy.S. */
- IFUNC_IMPL (i, name, strcpy,
- IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
- __strcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strcspn.S. */
- IFUNC_IMPL (i, name, strcspn,
- IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2),
- __strcspn_sse42)
- IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
- IFUNC_IMPL (i, name, strncasecmp,
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strncasecmp_avx)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_CPU_FEATURE (SSE4_2),
- __strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- HAS_CPU_FEATURE (SSSE3),
- __strncasecmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
- __strncasecmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncase_l.S. */
- IFUNC_IMPL (i, name, strncasecmp_l,
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_ARCH_FEATURE (AVX_Usable),
- __strncasecmp_l_avx)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_CPU_FEATURE (SSE4_2),
- __strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- HAS_CPU_FEATURE (SSSE3),
- __strncasecmp_l_ssse3)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
- __strncasecmp_l_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncat.S. */
- IFUNC_IMPL (i, name, strncat,
- IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
- __strncat_ssse3)
- IFUNC_IMPL_ADD (array, i, strncat, 1,
- __strncat_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strncpy.S. */
- IFUNC_IMPL (i, name, strncpy,
- IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
- __strncpy_ssse3)
- IFUNC_IMPL_ADD (array, i, strncpy, 1,
- __strncpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strpbrk.S. */
- IFUNC_IMPL (i, name, strpbrk,
- IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2),
- __strpbrk_sse42)
- IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
-
-
- /* Support sysdeps/x86_64/multiarch/strspn.S. */
- IFUNC_IMPL (i, name, strspn,
- IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2),
- __strspn_sse42)
- IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2))
-
- /* Support sysdeps/x86_64/multiarch/strstr.c. */
- IFUNC_IMPL (i, name, strstr,
- IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wcscpy.S. */
- IFUNC_IMPL (i, name, wcscpy,
- IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
- __wcscpy_ssse3)
- IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
- IFUNC_IMPL (i, name, wcsnlen,
- IFUNC_IMPL_ADD (array, i, wcsnlen,
- HAS_CPU_FEATURE (SSE4_1),
- __wcsnlen_sse4_1)
- IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
- IFUNC_IMPL (i, name, wmemcmp,
- IFUNC_IMPL_ADD (array, i, wmemcmp,
- (HAS_ARCH_FEATURE (AVX2_Usable)
- && HAS_CPU_FEATURE (MOVBE)),
- __wmemcmp_avx2_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
- __wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
- __wmemcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemset.c. */
- IFUNC_IMPL (i, name, wmemset,
- IFUNC_IMPL_ADD (array, i, wmemset, 1,
- __wmemset_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, wmemset,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __wmemset_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, wmemset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __wmemset_avx512_unaligned))
-
-#ifdef SHARED
- /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
- IFUNC_IMPL (i, name, __memcpy_chk,
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __memcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
- __memcpy_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/memcpy.S. */
- IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __memcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
- __memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
- __memcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __memcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, memcpy, 1,
- __memcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
-
- /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
- IFUNC_IMPL (i, name, __mempcpy_chk,
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_chk_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_chk_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_chk_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_CPU_FEATURE (SSSE3),
- __mempcpy_chk_ssse3)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
- __mempcpy_chk_sse2_unaligned_erms))
-
- /* Support sysdeps/x86_64/multiarch/mempcpy.S. */
- IFUNC_IMPL (i, name, mempcpy,
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_no_vzeroupper)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __mempcpy_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_avx_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX_Usable),
- __mempcpy_avx_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
- __mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
- __mempcpy_ssse3)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- __mempcpy_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- __mempcpy_sse2_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
-
- /* Support sysdeps/x86_64/multiarch/strncmp.S. */
- IFUNC_IMPL (i, name, strncmp,
- IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
- __strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
- __strncmp_ssse3)
- IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
-
- /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */
- IFUNC_IMPL (i, name, __wmemset_chk,
- IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
- __wmemset_chk_sse2_unaligned)
- IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- HAS_ARCH_FEATURE (AVX2_Usable),
- __wmemset_chk_avx2_unaligned)
- IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
- __wmemset_chk_avx512_unaligned))
-#endif
-
- return i;
-}
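
[Editor's note: the header comment of the deleted ifunc-impl-list.c states the contract: fill ARRAY with every implementation of NAME that the target supports and return the count, so the test suite can exercise each variant rather than only the one the IFUNC resolver would pick. Below is a rough C sketch of that enumeration pattern, with hypothetical names in place of glibc's IFUNC_IMPL/IFUNC_IMPL_ADD macros and GCC's __builtin_cpu_supports standing in for HAS_CPU_FEATURE/HAS_ARCH_FEATURE.]

    #include <stddef.h>

    struct impl { const char *name; int usable; void *fn; };

    /* Hypothetical variants standing in for __memcmp_avx2_movbe etc.  */
    extern int memcmp_avx2 (const void *, const void *, size_t);
    extern int memcmp_sse2 (const void *, const void *, size_t);

    static size_t
    list_memcmp_impls (struct impl *array, size_t max)
    {
      size_t i = 0;
      /* Every known variant gets an entry; the 'usable' flag records
         whether this CPU can actually run it, mirroring the feature
         predicates passed to IFUNC_IMPL_ADD.  */
      if (i < max)
        array[i++] = (struct impl) { "__memcmp_avx2",
                                     __builtin_cpu_supports ("avx2"),
                                     (void *) memcmp_avx2 };
      if (i < max)
        array[i++] = (struct impl) { "__memcmp_sse2", 1,
                                     (void *) memcmp_sse2 };
      return i;   /* number of valid entries, as the deleted comment describes */
    }

[The real list above also keeps the memcpy/mempcpy family and __wmemset_chk inside #ifdef SHARED, since those entries only exist in the shared build.]
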
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
deleted file mode 100644
index d761985a47..0000000000
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Common definition for wmemset/wmemset_chk ifunc selections.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
- return OPTIMIZE (avx512_unaligned);
- else
- return OPTIMIZE (avx2_unaligned);
- }
-
- return OPTIMIZE (sse2_unaligned);
-}
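
[Editor's note: the deleted ifunc-wmemset.h selector encodes a preference order -- AVX-512 unaligned when usable and not discouraged, otherwise AVX2 unaligned, otherwise SSE2 -- gated on glibc-internal bits such as Prefer_No_VZEROUPPER, AVX_Fast_Unaligned_Load and Prefer_No_AVX512. The following is a hedged sketch of that tiered dispatch using GCC's __builtin_cpu_supports, which cannot express those preference tunables, so only the ordering is reproduced; the variant names are hypothetical.]

    #include <stddef.h>
    #include <wchar.h>

    /* Hypothetical stand-ins for __wmemset_{sse2,avx2,avx512}_unaligned.  */
    extern wchar_t *wmemset_sse2 (wchar_t *, wchar_t, size_t);
    extern wchar_t *wmemset_avx2 (wchar_t *, wchar_t, size_t);
    extern wchar_t *wmemset_avx512 (wchar_t *, wchar_t, size_t);

    typedef wchar_t *(*wmemset_fn) (wchar_t *, wchar_t, size_t);

    static wmemset_fn
    select_wmemset (void)
    {
      /* Same preference order as the deleted IFUNC_SELECTOR: widest
         vector variant first, plain SSE2 as the fallback.  */
      if (__builtin_cpu_supports ("avx512f"))
        return wmemset_avx512;
      if (__builtin_cpu_supports ("avx2"))
        return wmemset_avx2;
      return wmemset_sse2;
    }

[In glibc the resolver runs once, during relocation; a standalone program would call select_wmemset once and cache the returned pointer.]
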
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
deleted file mode 100644
index 47630dd97b..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ /dev/null
@@ -1,425 +0,0 @@
-/* memcmp/wmemcmp optimized with AVX2.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-/* memcmp/wmemcmp is implemented as:
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
- to avoid branches.
- 2. Use overlapping compare to avoid branch.
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
- bytes for wmemcmp.
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
- area.
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_avx2_movbe
-# endif
-
-# ifdef USE_AS_WMEMCMP
-# define VPCMPEQ vpcmpeqd
-# else
-# define VPCMPEQ vpcmpeqb
-# endif
-
-# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
-# endif
-
-# define VEC_SIZE 32
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- .section .text.avx,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
-# endif
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-
-L(last_2x_vec):
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
-L(last_vec):
- /* Use overlapping loads to avoid branches. */
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec):
- /* A byte or int32 is different within 16 or 32 bytes. */
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi, %rcx), %edx
- cmpl (%rsi, %rcx), %edx
-L(wmemcmp_return):
- setl %al
- negl %eax
- orl $1, %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
-# ifdef USE_AS_WMEMCMP
- .p2align 4
-L(4):
- xorl %eax, %eax
- movl (%rdi), %edx
- cmpl (%rsi), %edx
- jne L(wmemcmp_return)
- ret
-# else
- .p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches. */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- je L(exit)
- sbbl %eax, %eax
- orl $1, %eax
- ret
-
- .p2align 4
-L(exit):
- ret
-
- .p2align 4
-L(between_2_3):
- /* Load as big endian with overlapping loads and bswap to avoid
- branches. */
- movzwl -2(%rdi, %rdx), %eax
- movzwl -2(%rsi, %rdx), %ecx
- shll $16, %eax
- shll $16, %ecx
- movzwl (%rdi), %edi
- movzwl (%rsi), %esi
- orl %edi, %eax
- orl %esi, %ecx
- bswap %eax
- bswap %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4
-L(1):
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- subl %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
- cmpb $4, %dl
- je L(4)
- jb L(zero)
-# else
- cmpb $1, %dl
- je L(1)
- jb L(zero)
- cmpb $4, %dl
- jb L(between_2_3)
- cmpb $8, %dl
- jb L(between_4_7)
-# endif
- cmpb $16, %dl
- jae L(between_16_31)
- /* It is between 8 and 15 bytes. */
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- /* Use overlapping loads to avoid branches. */
- leaq -8(%rdi, %rdx), %rdi
- leaq -8(%rsi, %rdx), %rsi
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- ret
-
- .p2align 4
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
- vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -16(%rdi, %rdx), %rdi
- leaq -16(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
- ret
-
- .p2align 4
-L(more_2x_vec):
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-
- /* From 4 * VEC to 8 * VEC, inclusively. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-
- vpand %ymm1, %ymm2, %ymm5
- vpand %ymm3, %ymm4, %ymm6
- vpand %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
- VZEROUPPER
- ret
-
- .p2align 4
-L(more_8x_vec):
- /* More than 8 * VEC. Check the first VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Align the first memory area for aligned loads in the loop.
- Compute how much the first memory area is misaligned. */
- movq %rdi, %rcx
- andl $(VEC_SIZE - 1), %ecx
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %rcx
- /* Adjust the second memory area. */
- subq %rcx, %rsi
- /* Adjust the first memory area which should be aligned now. */
- subq %rcx, %rdi
- /* Adjust length. */
- addq %rcx, %rdx
-
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rsi
-
- subq $(VEC_SIZE * 4), %rdx
- cmpq $(VEC_SIZE * 4), %rdx
- jae L(loop_4x_vec)
-
- /* Less than 4 * VEC. */
- cmpq $VEC_SIZE, %rdx
- jbe L(last_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_2x_vec)
-
-L(last_4x_vec):
- /* From 2 * VEC to 4 * VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- VZEROUPPER
- ret
-
- .p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- subl $VEC_MASK, %eax
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rcx), %edx
- cmpl VEC_SIZE(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl VEC_SIZE(%rdi, %rcx), %eax
- movzbl VEC_SIZE(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-
- .p2align 4
-L(first_vec_x2):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER
- ret
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 771639f662..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,1776 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-# define JMPTBL(I, B) (I - B)
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), %rcx; \
- add %r11, %rcx; \
- jmp *%rcx; \
- ud2
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
-# endif
- pxor %xmm0, %xmm0
- cmp $79, %rdx
- ja L(79bytesormore)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %rdx
- je L(firstbyte)
-# endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(firstbyte):
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rsi), %xmm1
- movdqu (%rdi), %xmm2
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-L(less128bytes):
- sub $64, %rdx
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
- cmp $32, %rdx
- jb L(less32bytesin64)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin64):
- add $64, %rdi
- add $64, %rsi
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(128bytesormore):
- cmp $512, %rdx
- ja L(512bytesormore)
- cmp $256, %rdx
- ja L(less512bytes)
-L(less256bytes):
- sub $128, %rdx
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqu 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqu 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- add $128, %rsi
- add $128, %rdi
-
- cmp $64, %rdx
- jae L(less128bytes)
-
- cmp $32, %rdx
- jb L(less32bytesin128)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin128):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(less512bytes):
- sub $256, %rdx
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqu 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqu 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqu 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqu 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqu 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqu 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- movdqu 128(%rdi), %xmm2
- pxor 128(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(144bytesin256)
-
- movdqu 144(%rdi), %xmm2
- pxor 144(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(160bytesin256)
-
- movdqu 160(%rdi), %xmm2
- pxor 160(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(176bytesin256)
-
- movdqu 176(%rdi), %xmm2
- pxor 176(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(192bytesin256)
-
- movdqu 192(%rdi), %xmm2
- pxor 192(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(208bytesin256)
-
- movdqu 208(%rdi), %xmm2
- pxor 208(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(224bytesin256)
-
- movdqu 224(%rdi), %xmm2
- pxor 224(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(240bytesin256)
-
- movdqu 240(%rdi), %xmm2
- pxor 240(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(256bytesin256)
-
- add $256, %rsi
- add $256, %rdi
-
- cmp $128, %rdx
- jae L(less256bytes)
-
- cmp $64, %rdx
- jae L(less128bytes)
-
- cmp $32, %rdx
- jb L(less32bytesin256)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin256):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(512bytesormore):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- mov %r8, %r9
- shr $1, %r8
- add %r9, %r8
- cmp %r8, %rdx
- ja L(L2_L3_cache_unaglined)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqu 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqu 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqu 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(64bytesormore_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(L2_L3_cache_unaglined):
- sub $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqu 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqu 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqu 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(L2_L3_unaligned_128bytes_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-/*
- * This case is for machines which are sensitive to unaligned instructions.
- */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- sub $64, %rdx
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
- cmp $32, %rdx
- jb L(less32bytesin64in2alinged)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin64in2alinged):
- add $64, %rdi
- add $64, %rsi
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $512, %rdx
- ja L(512bytesormorein2aligned)
- cmp $256, %rdx
- ja L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
- sub $128, %rdx
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqa 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqa 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- add $128, %rsi
- add $128, %rdi
-
- cmp $64, %rdx
- jae L(less128bytesin2aligned)
-
- cmp $32, %rdx
- jb L(less32bytesin128in2aligned)
-
- movdqu (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqu 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin128in2aligned):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(256bytesormorein2aligned):
-
- sub $256, %rdx
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
-
- movdqa 32(%rdi), %xmm2
- pxor 32(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(48bytesin256)
-
- movdqa 48(%rdi), %xmm2
- pxor 48(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(64bytesin256)
-
- movdqa 64(%rdi), %xmm2
- pxor 64(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(80bytesin256)
-
- movdqa 80(%rdi), %xmm2
- pxor 80(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(96bytesin256)
-
- movdqa 96(%rdi), %xmm2
- pxor 96(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(112bytesin256)
-
- movdqa 112(%rdi), %xmm2
- pxor 112(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(128bytesin256)
-
- movdqa 128(%rdi), %xmm2
- pxor 128(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(144bytesin256)
-
- movdqa 144(%rdi), %xmm2
- pxor 144(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(160bytesin256)
-
- movdqa 160(%rdi), %xmm2
- pxor 160(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(176bytesin256)
-
- movdqa 176(%rdi), %xmm2
- pxor 176(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(192bytesin256)
-
- movdqa 192(%rdi), %xmm2
- pxor 192(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(208bytesin256)
-
- movdqa 208(%rdi), %xmm2
- pxor 208(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(224bytesin256)
-
- movdqa 224(%rdi), %xmm2
- pxor 224(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(240bytesin256)
-
- movdqa 240(%rdi), %xmm2
- pxor 240(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(256bytesin256)
-
- add $256, %rsi
- add $256, %rdi
-
- cmp $128, %rdx
- jae L(less256bytesin2alinged)
-
- cmp $64, %rdx
- jae L(less128bytesin2aligned)
-
- cmp $32, %rdx
- jb L(less32bytesin256in2alinged)
-
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(16bytesin256)
-
- movdqa 16(%rdi), %xmm2
- pxor 16(%rsi), %xmm2
- ptest %xmm2, %xmm0
- jnc L(32bytesin256)
- sub $32, %rdx
- add $32, %rdi
- add $32, %rsi
-L(less32bytesin256in2alinged):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
- .p2align 4
-L(512bytesormorein2aligned):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- mov %r8, %r9
- shr $1, %r8
- add %r9, %r8
- cmp %r8, %rdx
- ja L(L2_L3_cache_aglined)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqa 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqa 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqa 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(64bytesormore_loopin2aligned)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-L(L2_L3_cache_aglined):
- sub $64, %rdx
-
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm2
- pxor (%rsi), %xmm2
- movdqa %xmm2, %xmm1
-
- movdqa 16(%rdi), %xmm3
- pxor 16(%rsi), %xmm3
- por %xmm3, %xmm1
-
- movdqa 32(%rdi), %xmm4
- pxor 32(%rsi), %xmm4
- por %xmm4, %xmm1
-
- movdqa 48(%rdi), %xmm5
- pxor 48(%rsi), %xmm5
- por %xmm5, %xmm1
-
- ptest %xmm1, %xmm0
- jnc L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- jae L(L2_L3_aligned_128bytes_loop)
-
- add $64, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-
- .p2align 4
-L(64bytesormore_loop_end):
- add $16, %rdi
- add $16, %rsi
- ptest %xmm2, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- ptest %xmm3, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- ptest %xmm4, %xmm0
- jnc L(16bytes)
-
- add $16, %rdi
- add $16, %rsi
- jmp L(16bytes)
-
-L(256bytesin256):
- add $256, %rdi
- add $256, %rsi
- jmp L(16bytes)
-L(240bytesin256):
- add $240, %rdi
- add $240, %rsi
- jmp L(16bytes)
-L(224bytesin256):
- add $224, %rdi
- add $224, %rsi
- jmp L(16bytes)
-L(208bytesin256):
- add $208, %rdi
- add $208, %rsi
- jmp L(16bytes)
-L(192bytesin256):
- add $192, %rdi
- add $192, %rsi
- jmp L(16bytes)
-L(176bytesin256):
- add $176, %rdi
- add $176, %rsi
- jmp L(16bytes)
-L(160bytesin256):
- add $160, %rdi
- add $160, %rsi
- jmp L(16bytes)
-L(144bytesin256):
- add $144, %rdi
- add $144, %rsi
- jmp L(16bytes)
-L(128bytesin256):
- add $128, %rdi
- add $128, %rsi
- jmp L(16bytes)
-L(112bytesin256):
- add $112, %rdi
- add $112, %rsi
- jmp L(16bytes)
-L(96bytesin256):
- add $96, %rdi
- add $96, %rsi
- jmp L(16bytes)
-L(80bytesin256):
- add $80, %rdi
- add $80, %rsi
- jmp L(16bytes)
-L(64bytesin256):
- add $64, %rdi
- add $64, %rsi
- jmp L(16bytes)
-L(48bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(32bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(16bytesin256):
- add $16, %rdi
- add $16, %rsi
-L(16bytes):
- mov -16(%rdi), %rax
- mov -16(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(8bytes):
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(12bytes):
- mov -12(%rdi), %rax
- mov -12(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(4bytes):
- mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
-L(0bytes):
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal case for wmemcmp */
- .p2align 4
-L(65bytes):
- movdqu -65(%rdi), %xmm1
- movdqu -65(%rsi), %xmm2
- mov $-65, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(49bytes):
- movdqu -49(%rdi), %xmm1
- movdqu -49(%rsi), %xmm2
- mov $-49, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(33bytes):
- movdqu -33(%rdi), %xmm1
- movdqu -33(%rsi), %xmm2
- mov $-33, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(17bytes):
- mov -17(%rdi), %rax
- mov -17(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(9bytes):
- mov -9(%rdi), %rax
- mov -9(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(13bytes):
- mov -13(%rdi), %rax
- mov -13(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(5bytes):
- mov -5(%rdi), %eax
- mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(66bytes):
- movdqu -66(%rdi), %xmm1
- movdqu -66(%rsi), %xmm2
- mov $-66, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(50bytes):
- movdqu -50(%rdi), %xmm1
- movdqu -50(%rsi), %xmm2
- mov $-50, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(34bytes):
- movdqu -34(%rdi), %xmm1
- movdqu -34(%rsi), %xmm2
- mov $-34, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(18bytes):
- mov -18(%rdi), %rax
- mov -18(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(10bytes):
- mov -10(%rdi), %rax
- mov -10(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(14bytes):
- mov -14(%rdi), %rax
- mov -14(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
- movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(67bytes):
- movdqu -67(%rdi), %xmm2
- movdqu -67(%rsi), %xmm1
- mov $-67, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(51bytes):
- movdqu -51(%rdi), %xmm2
- movdqu -51(%rsi), %xmm1
- mov $-51, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(35bytes):
- movdqu -35(%rsi), %xmm1
- movdqu -35(%rdi), %xmm2
- mov $-35, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(19bytes):
- mov -19(%rdi), %rax
- mov -19(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-L(11bytes):
- mov -11(%rdi), %rax
- mov -11(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(15bytes):
- mov -15(%rdi), %rax
- mov -15(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(7bytes):
- mov -7(%rdi), %eax
- mov -7(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
-L(1bytes):
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %ecx
- sub %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(68bytes):
- movdqu -68(%rdi), %xmm2
- movdqu -68(%rsi), %xmm1
- mov $-68, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(52bytes):
- movdqu -52(%rdi), %xmm2
- movdqu -52(%rsi), %xmm1
- mov $-52, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(36bytes):
- movdqu -36(%rdi), %xmm2
- movdqu -36(%rsi), %xmm1
- mov $-36, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(20bytes):
- movdqu -20(%rdi), %xmm2
- movdqu -20(%rsi), %xmm1
- mov $-20, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
- .p2align 4
-L(69bytes):
- movdqu -69(%rsi), %xmm1
- movdqu -69(%rdi), %xmm2
- mov $-69, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(53bytes):
- movdqu -53(%rsi), %xmm1
- movdqu -53(%rdi), %xmm2
- mov $-53, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(37bytes):
- movdqu -37(%rsi), %xmm1
- movdqu -37(%rdi), %xmm2
- mov $-37, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(21bytes):
- movdqu -21(%rsi), %xmm1
- movdqu -21(%rdi), %xmm2
- mov $-21, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(70bytes):
- movdqu -70(%rsi), %xmm1
- movdqu -70(%rdi), %xmm2
- mov $-70, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(54bytes):
- movdqu -54(%rsi), %xmm1
- movdqu -54(%rdi), %xmm2
- mov $-54, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(38bytes):
- movdqu -38(%rsi), %xmm1
- movdqu -38(%rdi), %xmm2
- mov $-38, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(22bytes):
- movdqu -22(%rsi), %xmm1
- movdqu -22(%rdi), %xmm2
- mov $-22, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(71bytes):
- movdqu -71(%rsi), %xmm1
- movdqu -71(%rdi), %xmm2
- mov $-71, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(55bytes):
- movdqu -55(%rdi), %xmm2
- movdqu -55(%rsi), %xmm1
- mov $-55, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(39bytes):
- movdqu -39(%rdi), %xmm2
- movdqu -39(%rsi), %xmm1
- mov $-39, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(23bytes):
- movdqu -23(%rdi), %xmm2
- movdqu -23(%rsi), %xmm1
- mov $-23, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(72bytes):
- movdqu -72(%rsi), %xmm1
- movdqu -72(%rdi), %xmm2
- mov $-72, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(56bytes):
- movdqu -56(%rdi), %xmm2
- movdqu -56(%rsi), %xmm1
- mov $-56, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(40bytes):
- movdqu -40(%rdi), %xmm2
- movdqu -40(%rsi), %xmm1
- mov $-40, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(24bytes):
- movdqu -24(%rdi), %xmm2
- movdqu -24(%rsi), %xmm1
- mov $-24, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -8(%rsi), %rcx
- mov -8(%rdi), %rax
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
- .p2align 4
-L(73bytes):
- movdqu -73(%rsi), %xmm1
- movdqu -73(%rdi), %xmm2
- mov $-73, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(57bytes):
- movdqu -57(%rdi), %xmm2
- movdqu -57(%rsi), %xmm1
- mov $-57, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(41bytes):
- movdqu -41(%rdi), %xmm2
- movdqu -41(%rsi), %xmm1
- mov $-41, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(25bytes):
- movdqu -25(%rdi), %xmm2
- movdqu -25(%rsi), %xmm1
- mov $-25, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -9(%rdi), %rax
- mov -9(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(74bytes):
- movdqu -74(%rsi), %xmm1
- movdqu -74(%rdi), %xmm2
- mov $-74, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(58bytes):
- movdqu -58(%rdi), %xmm2
- movdqu -58(%rsi), %xmm1
- mov $-58, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(42bytes):
- movdqu -42(%rdi), %xmm2
- movdqu -42(%rsi), %xmm1
- mov $-42, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(26bytes):
- movdqu -26(%rdi), %xmm2
- movdqu -26(%rsi), %xmm1
- mov $-26, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -10(%rdi), %rax
- mov -10(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
-
- .p2align 4
-L(75bytes):
- movdqu -75(%rsi), %xmm1
- movdqu -75(%rdi), %xmm2
- mov $-75, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(59bytes):
- movdqu -59(%rdi), %xmm2
- movdqu -59(%rsi), %xmm1
- mov $-59, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(43bytes):
- movdqu -43(%rdi), %xmm2
- movdqu -43(%rsi), %xmm1
- mov $-43, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(27bytes):
- movdqu -27(%rdi), %xmm2
- movdqu -27(%rsi), %xmm1
- mov $-27, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -11(%rdi), %rax
- mov -11(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rdi), %eax
- mov -4(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-# endif
- .p2align 4
-L(76bytes):
- movdqu -76(%rsi), %xmm1
- movdqu -76(%rdi), %xmm2
- mov $-76, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(60bytes):
- movdqu -60(%rdi), %xmm2
- movdqu -60(%rsi), %xmm1
- mov $-60, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(44bytes):
- movdqu -44(%rdi), %xmm2
- movdqu -44(%rsi), %xmm1
- mov $-44, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(28bytes):
- movdqu -28(%rdi), %xmm2
- movdqu -28(%rsi), %xmm1
- mov $-28, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -12(%rdi), %rax
- mov -12(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
- mov -4(%rdi), %eax
- cmp %eax, %ecx
-# else
- cmp -4(%rdi), %ecx
-# endif
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
- .p2align 4
-L(77bytes):
- movdqu -77(%rsi), %xmm1
- movdqu -77(%rdi), %xmm2
- mov $-77, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(61bytes):
- movdqu -61(%rdi), %xmm2
- movdqu -61(%rsi), %xmm1
- mov $-61, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(45bytes):
- movdqu -45(%rdi), %xmm2
- movdqu -45(%rsi), %xmm1
- mov $-45, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(29bytes):
- movdqu -29(%rdi), %xmm2
- movdqu -29(%rsi), %xmm1
- mov $-29, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -13(%rdi), %rax
- mov -13(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(78bytes):
- movdqu -78(%rsi), %xmm1
- movdqu -78(%rdi), %xmm2
- mov $-78, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(62bytes):
- movdqu -62(%rdi), %xmm2
- movdqu -62(%rsi), %xmm1
- mov $-62, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(46bytes):
- movdqu -46(%rdi), %xmm2
- movdqu -46(%rsi), %xmm1
- mov $-46, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(30bytes):
- movdqu -30(%rdi), %xmm2
- movdqu -30(%rsi), %xmm1
- mov $-30, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -14(%rdi), %rax
- mov -14(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(79bytes):
- movdqu -79(%rsi), %xmm1
- movdqu -79(%rdi), %xmm2
- mov $-79, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(63bytes):
- movdqu -63(%rdi), %xmm2
- movdqu -63(%rsi), %xmm1
- mov $-63, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(47bytes):
- movdqu -47(%rdi), %xmm2
- movdqu -47(%rsi), %xmm1
- mov $-47, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(31bytes):
- movdqu -31(%rdi), %xmm2
- movdqu -31(%rsi), %xmm1
- mov $-31, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
- mov -15(%rdi), %rax
- mov -15(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-# endif
- .p2align 4
-L(64bytes):
- movdqu -64(%rdi), %xmm2
- movdqu -64(%rsi), %xmm1
- mov $-64, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(48bytes):
- movdqu -48(%rdi), %xmm2
- movdqu -48(%rsi), %xmm1
- mov $-48, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-L(32bytes):
- movdqu -32(%rdi), %xmm2
- movdqu -32(%rsi), %xmm1
- mov $-32, %dl
- pxor %xmm1, %xmm2
- ptest %xmm2, %xmm0
- jnc L(less16bytes)
-
- mov -16(%rdi), %rax
- mov -16(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
-
- mov -8(%rdi), %rax
- mov -8(%rsi), %rcx
- cmp %rax, %rcx
- jne L(diffin8bytes)
- xor %eax, %eax
- ret
-
-/*
- * Aligned to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
- */
- .p2align 3
-L(less16bytes):
- movsbq %dl, %rdx
- mov (%rsi, %rdx), %rcx
- mov (%rdi, %rdx), %rax
- cmp %rax, %rcx
- jne L(diffin8bytes)
- mov 8(%rsi, %rdx), %rcx
- mov 8(%rdi, %rdx), %rax
-L(diffin8bytes):
- cmp %eax, %ecx
- jne L(diffin4bytes)
- shr $32, %rcx
- shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
- cmp %eax, %ecx
- jne L(diffin4bytes)
- xor %eax, %eax
- ret
-# endif
-
-L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
-
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
- mov $1, %eax
- jl L(nequal_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(nequal_bigger):
- ret
-
-L(unreal_case):
- xor %eax, %eax
- ret
-# endif
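Once a 16-byte block is known to differ, L(diffin8bytes) above narrows an 8-byte difference down to the deciding byte by halving the width at each step (dword, word, byte) before forming the return value. Roughly, in C (a comes from the first buffer in %rax, b from the second in %rcx, both little-endian and already known to be unequal; this is a sketch of the idea, not the exact register flow):

#include <stdint.h>

static int
narrow_diff (uint64_t a, uint64_t b)
{
  if ((uint32_t) a == (uint32_t) b)     /* L(diffin8bytes): low dwords equal */
    {
      a >>= 32;
      b >>= 32;
    }
  if ((uint16_t) a == (uint16_t) b)     /* L(diffin4bytes): low words equal */
    {
      a >>= 16;
      b >>= 16;
    }
  if ((uint8_t) a != (uint8_t) b)       /* L(end): decided by the low byte */
    return (int) (uint8_t) a - (int) (uint8_t) b;
  /* L(diffin2bytes): low bytes equal, so the 16-bit difference is
     decided by the second byte and already carries the right sign.  */
  return (int) (a & 0xffff) - (int) (b & 0xffff);
}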
-
-END (MEMCMP)
-
- .section .rodata.sse4.1,"a",@progbits
- .p2align 3
-# ifndef USE_AS_WMEMCMP
-L(table_64bytes):
- .int JMPTBL (L(0bytes), L(table_64bytes))
- .int JMPTBL (L(1bytes), L(table_64bytes))
- .int JMPTBL (L(2bytes), L(table_64bytes))
- .int JMPTBL (L(3bytes), L(table_64bytes))
- .int JMPTBL (L(4bytes), L(table_64bytes))
- .int JMPTBL (L(5bytes), L(table_64bytes))
- .int JMPTBL (L(6bytes), L(table_64bytes))
- .int JMPTBL (L(7bytes), L(table_64bytes))
- .int JMPTBL (L(8bytes), L(table_64bytes))
- .int JMPTBL (L(9bytes), L(table_64bytes))
- .int JMPTBL (L(10bytes), L(table_64bytes))
- .int JMPTBL (L(11bytes), L(table_64bytes))
- .int JMPTBL (L(12bytes), L(table_64bytes))
- .int JMPTBL (L(13bytes), L(table_64bytes))
- .int JMPTBL (L(14bytes), L(table_64bytes))
- .int JMPTBL (L(15bytes), L(table_64bytes))
- .int JMPTBL (L(16bytes), L(table_64bytes))
- .int JMPTBL (L(17bytes), L(table_64bytes))
- .int JMPTBL (L(18bytes), L(table_64bytes))
- .int JMPTBL (L(19bytes), L(table_64bytes))
- .int JMPTBL (L(20bytes), L(table_64bytes))
- .int JMPTBL (L(21bytes), L(table_64bytes))
- .int JMPTBL (L(22bytes), L(table_64bytes))
- .int JMPTBL (L(23bytes), L(table_64bytes))
- .int JMPTBL (L(24bytes), L(table_64bytes))
- .int JMPTBL (L(25bytes), L(table_64bytes))
- .int JMPTBL (L(26bytes), L(table_64bytes))
- .int JMPTBL (L(27bytes), L(table_64bytes))
- .int JMPTBL (L(28bytes), L(table_64bytes))
- .int JMPTBL (L(29bytes), L(table_64bytes))
- .int JMPTBL (L(30bytes), L(table_64bytes))
- .int JMPTBL (L(31bytes), L(table_64bytes))
- .int JMPTBL (L(32bytes), L(table_64bytes))
- .int JMPTBL (L(33bytes), L(table_64bytes))
- .int JMPTBL (L(34bytes), L(table_64bytes))
- .int JMPTBL (L(35bytes), L(table_64bytes))
- .int JMPTBL (L(36bytes), L(table_64bytes))
- .int JMPTBL (L(37bytes), L(table_64bytes))
- .int JMPTBL (L(38bytes), L(table_64bytes))
- .int JMPTBL (L(39bytes), L(table_64bytes))
- .int JMPTBL (L(40bytes), L(table_64bytes))
- .int JMPTBL (L(41bytes), L(table_64bytes))
- .int JMPTBL (L(42bytes), L(table_64bytes))
- .int JMPTBL (L(43bytes), L(table_64bytes))
- .int JMPTBL (L(44bytes), L(table_64bytes))
- .int JMPTBL (L(45bytes), L(table_64bytes))
- .int JMPTBL (L(46bytes), L(table_64bytes))
- .int JMPTBL (L(47bytes), L(table_64bytes))
- .int JMPTBL (L(48bytes), L(table_64bytes))
- .int JMPTBL (L(49bytes), L(table_64bytes))
- .int JMPTBL (L(50bytes), L(table_64bytes))
- .int JMPTBL (L(51bytes), L(table_64bytes))
- .int JMPTBL (L(52bytes), L(table_64bytes))
- .int JMPTBL (L(53bytes), L(table_64bytes))
- .int JMPTBL (L(54bytes), L(table_64bytes))
- .int JMPTBL (L(55bytes), L(table_64bytes))
- .int JMPTBL (L(56bytes), L(table_64bytes))
- .int JMPTBL (L(57bytes), L(table_64bytes))
- .int JMPTBL (L(58bytes), L(table_64bytes))
- .int JMPTBL (L(59bytes), L(table_64bytes))
- .int JMPTBL (L(60bytes), L(table_64bytes))
- .int JMPTBL (L(61bytes), L(table_64bytes))
- .int JMPTBL (L(62bytes), L(table_64bytes))
- .int JMPTBL (L(63bytes), L(table_64bytes))
- .int JMPTBL (L(64bytes), L(table_64bytes))
- .int JMPTBL (L(65bytes), L(table_64bytes))
- .int JMPTBL (L(66bytes), L(table_64bytes))
- .int JMPTBL (L(67bytes), L(table_64bytes))
- .int JMPTBL (L(68bytes), L(table_64bytes))
- .int JMPTBL (L(69bytes), L(table_64bytes))
- .int JMPTBL (L(70bytes), L(table_64bytes))
- .int JMPTBL (L(71bytes), L(table_64bytes))
- .int JMPTBL (L(72bytes), L(table_64bytes))
- .int JMPTBL (L(73bytes), L(table_64bytes))
- .int JMPTBL (L(74bytes), L(table_64bytes))
- .int JMPTBL (L(75bytes), L(table_64bytes))
- .int JMPTBL (L(76bytes), L(table_64bytes))
- .int JMPTBL (L(77bytes), L(table_64bytes))
- .int JMPTBL (L(78bytes), L(table_64bytes))
- .int JMPTBL (L(79bytes), L(table_64bytes))
-# else
-L(table_64bytes):
- .int JMPTBL (L(0bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(4bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(8bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(12bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(16bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(20bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(24bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(28bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(32bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(36bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(40bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(44bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(48bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(52bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(56bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(60bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(64bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(68bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(72bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(76bytes), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
- .int JMPTBL (L(unreal_case), L(table_64bytes))
-# endif
-#endif
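For reference, every L(Nbytes) block above follows the same pattern: both pointers already point at the end of the buffers (hence the negative displacements), each step loads the 16-byte chunk that ends N bytes before the end, XORs the two chunks and lets PTEST report whether any byte differed, falling through in 16-byte strides until the final scalar compares. A rough intrinsics sketch of one such step (the real code tests the XOR against a value prepared earlier in %xmm0 and branches on the carry flag; the helper name here is illustrative only):

#include <smmintrin.h>   /* SSE4.1 intrinsics */

/* Compare the 16-byte chunks ending `back' bytes before the (already
   advanced) end pointers; nonzero means they differ.  */
static int
chunk_differs (const unsigned char *s1_end, const unsigned char *s2_end,
               int back)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) (s1_end - back));
  __m128i b = _mm_loadu_si128 ((const __m128i *) (s2_end - back));
  __m128i diff = _mm_xor_si128 (a, b);            /* pxor */
  return !_mm_testz_si128 (diff, diff);           /* ptest */
}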
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index 8d7d2fe67b..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1990 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
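Concretely, the ordering has to be computed on unsigned bytes for memcmp but on signed 32-bit elements for wmemcmp. A minimal C sketch of the required semantics, with wchar_t written as int for brevity (function names are illustrative only):

#include <stddef.h>

/* memcmp: rank by the first differing byte, compared as unsigned.  */
static int
byte_order (const void *a, const void *b, size_t n)
{
  const unsigned char *p = a, *q = b;
  for (size_t i = 0; i < n; i++)
    if (p[i] != q[i])
      return p[i] < q[i] ? -1 : 1;
  return 0;
}

/* wmemcmp: rank by the first differing element, compared as signed.  */
static int
wide_order (const int *a, const int *b, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;
  return 0;
}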
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
- test %rdx, %rdx
- jz L(equal)
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx
- jae L(48bytesormore) /* LEN >= 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* RCX >= 48.  */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
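In the byte variant above, %dl/%dh hold the pcmpeqb mask minus 0xffff, so the first differing byte is located by probing the low bits one at a time (Byte16 ... Byte22) rather than with BSF. The same search, sketched in C (the function name is illustrative only):

/* eq_mask is the pmovmskb result: bit i set means byte i compared equal.  */
static int
first_diff_index (unsigned eq_mask)
{
  unsigned d = ~eq_mask & 0xffff;   /* the asm keeps eq_mask - 0xffff, which
                                       has the same lowest set bit */
  for (int i = 0; i < 16; i++)
    if (d & (1u << i))
      return i;                     /* position of the first differing byte */
  return -1;                        /* all 16 bytes equal */
}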
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get here only if we already know there is a
-   difference.  */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
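The L(shr_N) paths above handle a source that is misaligned by N bytes: the source pointer is rounded down to a 16-byte boundary and PALIGNR stitches two aligned loads back into the 16 unaligned source bytes, which are then compared byte-wise against an aligned chunk of the other buffer. Because the PALIGNR immediate must be a compile-time constant, the code is duplicated once per shift value; one such step might look like this in intrinsics (helper name illustrative, shift fixed at 1):

#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* src_base is the source rounded down to 16 bytes; dst16 is a 16-byte
   aligned chunk of the other buffer.  Returns the pmovmskb mask
   (0xffff means all 16 bytes compared equal).  */
static int
compare_shift1 (const unsigned char *src_base, const unsigned char *dst16)
{
  __m128i lo  = _mm_load_si128 ((const __m128i *) src_base);
  __m128i hi  = _mm_load_si128 ((const __m128i *) src_base + 1);
  __m128i src = _mm_alignr_epi8 (hi, lo, 1);       /* palignr $1 */
  __m128i eq  = _mm_cmpeq_epi8 (src,
                                _mm_load_si128 ((const __m128i *) dst16));
  return _mm_movemask_epi8 (eq);                   /* pmovmskb */
}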
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
deleted file mode 100644
index 0c9804b7e9..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Multiple versions of memcmp
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(memcmp)
- .type memcmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 1f
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 1f
- HAS_CPU_FEATURE (MOVBE)
- jz 1f
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
- leaq __memcmp_avx2_movbe(%rip), %rax
- ret
-
-1: HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __memcmp_sse2(%rip), %rax
- ret
-
-2: HAS_CPU_FEATURE (SSE4_1)
- jz 3f
- leaq __memcmp_sse4_1(%rip), %rax
- ret
-
-3: leaq __memcmp_ssse3(%rip), %rax
- ret
-
-END(memcmp)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __memcmp_sse2, @function; \
- .p2align 4; \
- .globl __memcmp_sse2; \
- .hidden __memcmp_sse2; \
- __memcmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memcmp calls through a PLT.
- The speedup we get from using SSE4 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
-#endif
-
-#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 4e060a27fd..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3180 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets.  INDEX is a register containing the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- jmp *INDEX; \
- ud2
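Each table entry is the 32-bit offset of its target label relative to the table itself (JMPTBL (I, B) expands to I - B), so the table stays position-independent; the dispatch sequence sign-extends the entry, adds the table address back and jumps through the result. A loose C analog using GCC's labels-as-values extension (absolute label addresses rather than table-relative offsets, which portable C cannot express):

#include <stdio.h>

/* GCC extension: labels as values.  n must be 0..3 in this sketch.  */
static void
dispatch_tail (unsigned n)
{
  static void *const table[] = { &&bytes0, &&bytes1, &&bytes2, &&bytes3 };
  goto *table[n];
 bytes0: puts ("handle 0 remaining bytes"); return;
 bytes1: puts ("handle 1 remaining byte");  return;
 bytes2: puts ("handle 2 remaining bytes"); return;
 bytes3: puts ("handle 3 remaining bytes"); return;
}

int
main (void)
{
  dispatch_tail (2);
  return 0;
}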
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %rdi, %rax
-#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index f3ea52a46c..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3150 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
- && (defined SHARED \
- || defined USE_AS_MEMMOVE \
- || !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   relative offsets.  INDEX is a register that contains the index into
-   the jump table.  SCALE is the scale of INDEX.  */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- jmp *INDEX; \
- ud2
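The macro above implements a position-independent computed jump: each table entry stores the 32-bit difference "target - table", and the dispatch code adds the table's runtime address back in before the indirect jmp (the trailing ud2 is a trap that should never be reached). As a purely illustrative aid, not part of the deleted file, the address arithmetic could be sketched in C roughly as follows:

	#include <stdint.h>
	#include <stddef.h>

	/* Sketch of BRANCH_TO_JMPTBL_ENTRY's address computation: the table
	   holds "target - table" as a signed 32-bit offset, so the absolute
	   target is the sign-extended entry plus the table's own address.  */
	void *
	jmptbl_target (const int32_t *table, size_t index)
	{
	  intptr_t offset = table[index];   /* movslq (%r11, INDEX, SCALE), INDEX */
	  return (char *) table + offset;   /* lea (%r11, INDEX), INDEX; jmp *INDEX */
	}
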
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %rdi, %rax
-#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
deleted file mode 100644
index af2770397c..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Multiple versions of memcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. In static binaries we need memcpy before the initialization
- happened. */
-#if defined SHARED && IS_IN (libc)
- .text
-ENTRY(__new_memcpy)
- .type __new_memcpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memcpy_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memcpy_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memcpy_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memcpy_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memcpy_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memcpy_ssse3(%rip), %RAX_LP
-2: ret
-END(__new_memcpy)
-
-# undef memcpy
-# include <shlib-compat.h>
-versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
-#endif
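
[Editor's note] The ifunc body removed just above selects a memcpy implementation once, at load time, from CPU features. The same dispatch pattern can be expressed in C with GCC's ifunc attribute. The sketch below is illustrative only: my_memcpy, my_memcpy_generic, my_memcpy_ssse3ish and resolve_my_memcpy are hypothetical names, not glibc symbols, and it assumes GCC on x86-64 ELF.

    #include <stddef.h>
    #include <string.h>

    typedef void *(*memcpy_fn) (void *, const void *, size_t);

    static void *
    my_memcpy_generic (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;
      while (n--)
        *d++ = *s++;
      return dst;
    }

    static void *
    my_memcpy_ssse3ish (void *dst, const void *src, size_t n)
    {
      /* Stand-in for a tuned variant; simply defer to the system memmove.  */
      return memmove (dst, src, n);
    }

    /* The resolver runs once, during relocation, and returns the function
       my_memcpy will resolve to from then on -- the same job the removed
       __new_memcpy body does with its HAS_CPU_FEATURE / HAS_ARCH_FEATURE
       checks.  */
    static memcpy_fn
    resolve_my_memcpy (void)
    {
      __builtin_cpu_init ();
      return __builtin_cpu_supports ("ssse3")
             ? my_memcpy_ssse3ish : my_memcpy_generic;
    }

    void *my_memcpy (void *dst, const void *src, size_t n)
         __attribute__ ((ifunc ("resolve_my_memcpy")));

Callers simply invoke my_memcpy; the dynamic loader patches the GOT entry with whatever the resolver returned, so there is no per-call branch on CPU features.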
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
deleted file mode 100644
index 8737fb9755..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Multiple versions of __memcpy_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch memcpy functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__memcpy_chk)
- .type __memcpy_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memcpy_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__memcpy_chk)
-# else
-# include "../memcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
deleted file mode 100644
index e195e93f15..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ /dev/null
@@ -1,12 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-
-# define SECTION(p) p##.avx
-# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
deleted file mode 100644
index f3ef10577c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,420 +0,0 @@
-/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-# include "asm-syntax.h"
-
- .section .text.avx512,"ax",@progbits
-# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_avx512_no_vzeroupper)
-
-ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (__mempcpy_avx512_no_vzeroupper)
-# endif
-
-# ifdef SHARED
-ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_avx512_no_vzeroupper)
-# endif
-
-ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
-# ifdef USE_AS_MEMPCPY
- add %rdx, %rax
-# endif
-L(start):
- lea (%rsi, %rdx), %rcx
- lea (%rdi, %rdx), %r9
- cmp $512, %rdx
- ja L(512bytesormore)
-
-L(check):
- cmp $16, %rdx
- jbe L(less_16bytes)
- cmp $256, %rdx
- jb L(less_256bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups -0x100(%rcx), %zmm4
- vmovups -0xC0(%rcx), %zmm5
- vmovups -0x80(%rcx), %zmm6
- vmovups -0x40(%rcx), %zmm7
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, -0x100(%r9)
- vmovups %zmm5, -0xC0(%r9)
- vmovups %zmm6, -0x80(%r9)
- vmovups %zmm7, -0x40(%r9)
- ret
-
-L(less_256bytes):
- cmp $128, %dl
- jb L(less_128bytes)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups -0x80(%rcx), %zmm2
- vmovups -0x40(%rcx), %zmm3
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, -0x80(%r9)
- vmovups %zmm3, -0x40(%r9)
- ret
-
-L(less_128bytes):
- cmp $64, %dl
- jb L(less_64bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu 0x20(%rsi), %ymm1
- vmovdqu -0x40(%rcx), %ymm2
- vmovdqu -0x20(%rcx), %ymm3
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, 0x20(%rdi)
- vmovdqu %ymm2, -0x40(%r9)
- vmovdqu %ymm3, -0x20(%r9)
- ret
-
-L(less_64bytes):
- cmp $32, %dl
- jb L(less_32bytes)
- vmovdqu (%rsi), %ymm0
- vmovdqu -0x20(%rcx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -0x20(%r9)
- ret
-
-L(less_32bytes):
- vmovdqu (%rsi), %xmm0
- vmovdqu -0x10(%rcx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -0x10(%r9)
- ret
-
-L(less_16bytes):
- cmp $8, %dl
- jb L(less_8bytes)
- movq (%rsi), %rsi
- movq -0x8(%rcx), %rcx
- movq %rsi, (%rdi)
- movq %rcx, -0x8(%r9)
- ret
-
-L(less_8bytes):
- cmp $4, %dl
- jb L(less_4bytes)
- mov (%rsi), %esi
- mov -0x4(%rcx), %ecx
- mov %esi, (%rdi)
- mov %ecx, -0x4(%r9)
- ret
-
-L(less_4bytes):
- cmp $2, %dl
- jb L(less_2bytes)
- mov (%rsi), %si
- mov -0x2(%rcx), %cx
- mov %si, (%rdi)
- mov %cx, -0x2(%r9)
- ret
-
-L(less_2bytes):
- cmp $1, %dl
- jb L(less_1bytes)
- mov (%rsi), %cl
- mov %cl, (%rdi)
-L(less_1bytes):
- ret
-
-L(512bytesormore):
-# ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %r8
-# else
- mov __x86_shared_cache_size_half(%rip), %r8
-# endif
- cmp %r8, %rdx
- jae L(preloop_large)
- cmp $1024, %rdx
- ja L(1024bytesormore)
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- prefetcht1 -0x200(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0x40(%rcx)
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- vmovups %zmm8, -0x200(%r9)
- vmovups %zmm9, -0x1C0(%r9)
- vmovups %zmm10, -0x180(%r9)
- vmovups %zmm11, -0x140(%r9)
- vmovups %zmm12, -0x100(%r9)
- vmovups %zmm13, -0xC0(%r9)
- vmovups %zmm14, -0x80(%r9)
- vmovups %zmm15, -0x40(%r9)
- ret
-
-L(1024bytesormore):
- cmp %rsi, %rdi
- ja L(1024bytesormore_bkw)
- sub $512, %r9
- vmovups -0x200(%rcx), %zmm8
- vmovups -0x1C0(%rcx), %zmm9
- vmovups -0x180(%rcx), %zmm10
- vmovups -0x140(%rcx), %zmm11
- vmovups -0x100(%rcx), %zmm12
- vmovups -0xC0(%rcx), %zmm13
- vmovups -0x80(%rcx), %zmm14
- vmovups -0x40(%rcx), %zmm15
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
-
-/* Loop with unaligned memory access. */
-L(gobble_512bytes_loop):
- vmovups (%rsi), %zmm0
- vmovups 0x40(%rsi), %zmm1
- vmovups 0x80(%rsi), %zmm2
- vmovups 0xC0(%rsi), %zmm3
- vmovups 0x100(%rsi), %zmm4
- vmovups 0x140(%rsi), %zmm5
- vmovups 0x180(%rsi), %zmm6
- vmovups 0x1C0(%rsi), %zmm7
- add $512, %rsi
- prefetcht1 (%rsi)
- prefetcht1 0x40(%rsi)
- prefetcht1 0x80(%rsi)
- prefetcht1 0xC0(%rsi)
- prefetcht1 0x100(%rsi)
- prefetcht1 0x140(%rsi)
- prefetcht1 0x180(%rsi)
- prefetcht1 0x1C0(%rsi)
- vmovups %zmm0, (%rdi)
- vmovups %zmm1, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm3, 0xC0(%rdi)
- vmovups %zmm4, 0x100(%rdi)
- vmovups %zmm5, 0x140(%rdi)
- vmovups %zmm6, 0x180(%rdi)
- vmovups %zmm7, 0x1C0(%rdi)
- add $512, %rdi
- cmp %r9, %rdi
- jb L(gobble_512bytes_loop)
- vmovups %zmm8, (%r9)
- vmovups %zmm9, 0x40(%r9)
- vmovups %zmm10, 0x80(%r9)
- vmovups %zmm11, 0xC0(%r9)
- vmovups %zmm12, 0x100(%r9)
- vmovups %zmm13, 0x140(%r9)
- vmovups %zmm14, 0x180(%r9)
- vmovups %zmm15, 0x1C0(%r9)
- ret
-
-L(1024bytesormore_bkw):
- add $512, %rdi
- vmovups 0x1C0(%rsi), %zmm8
- vmovups 0x180(%rsi), %zmm9
- vmovups 0x140(%rsi), %zmm10
- vmovups 0x100(%rsi), %zmm11
- vmovups 0xC0(%rsi), %zmm12
- vmovups 0x80(%rsi), %zmm13
- vmovups 0x40(%rsi), %zmm14
- vmovups (%rsi), %zmm15
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
-
-/* Backward loop with unaligned memory access. */
-L(gobble_512bytes_loop_bkw):
- vmovups -0x40(%rcx), %zmm0
- vmovups -0x80(%rcx), %zmm1
- vmovups -0xC0(%rcx), %zmm2
- vmovups -0x100(%rcx), %zmm3
- vmovups -0x140(%rcx), %zmm4
- vmovups -0x180(%rcx), %zmm5
- vmovups -0x1C0(%rcx), %zmm6
- vmovups -0x200(%rcx), %zmm7
- sub $512, %rcx
- prefetcht1 -0x40(%rcx)
- prefetcht1 -0x80(%rcx)
- prefetcht1 -0xC0(%rcx)
- prefetcht1 -0x100(%rcx)
- prefetcht1 -0x140(%rcx)
- prefetcht1 -0x180(%rcx)
- prefetcht1 -0x1C0(%rcx)
- prefetcht1 -0x200(%rcx)
- vmovups %zmm0, -0x40(%r9)
- vmovups %zmm1, -0x80(%r9)
- vmovups %zmm2, -0xC0(%r9)
- vmovups %zmm3, -0x100(%r9)
- vmovups %zmm4, -0x140(%r9)
- vmovups %zmm5, -0x180(%r9)
- vmovups %zmm6, -0x1C0(%r9)
- vmovups %zmm7, -0x200(%r9)
- sub $512, %r9
- cmp %rdi, %r9
- ja L(gobble_512bytes_loop_bkw)
- vmovups %zmm8, -0x40(%rdi)
- vmovups %zmm9, -0x80(%rdi)
- vmovups %zmm10, -0xC0(%rdi)
- vmovups %zmm11, -0x100(%rdi)
- vmovups %zmm12, -0x140(%rdi)
- vmovups %zmm13, -0x180(%rdi)
- vmovups %zmm14, -0x1C0(%rdi)
- vmovups %zmm15, -0x200(%rdi)
- ret
-
-L(preloop_large):
- cmp %rsi, %rdi
- ja L(preloop_large_bkw)
- vmovups (%rsi), %zmm4
- vmovups 0x40(%rsi), %zmm5
-
-/* Align destination for access with non-temporal stores in the loop. */
- mov %rdi, %r8
- and $-0x80, %rdi
- add $0x80, %rdi
- sub %rdi, %r8
- sub %r8, %rsi
- add %r8, %rdx
-L(gobble_256bytes_nt_loop):
- prefetcht1 0x200(%rsi)
- prefetcht1 0x240(%rsi)
- prefetcht1 0x280(%rsi)
- prefetcht1 0x2C0(%rsi)
- prefetcht1 0x300(%rsi)
- prefetcht1 0x340(%rsi)
- prefetcht1 0x380(%rsi)
- prefetcht1 0x3C0(%rsi)
- vmovdqu64 (%rsi), %zmm0
- vmovdqu64 0x40(%rsi), %zmm1
- vmovdqu64 0x80(%rsi), %zmm2
- vmovdqu64 0xC0(%rsi), %zmm3
- vmovntdq %zmm0, (%rdi)
- vmovntdq %zmm1, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm3, 0xC0(%rdi)
- sub $256, %rdx
- add $256, %rsi
- add $256, %rdi
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop)
- sfence
- vmovups %zmm4, (%rax)
- vmovups %zmm5, 0x40(%rax)
- jmp L(check)
-
-L(preloop_large_bkw):
- vmovups -0x80(%rcx), %zmm4
- vmovups -0x40(%rcx), %zmm5
-
-/* Align end of destination for access with non-temporal stores. */
- mov %r9, %r8
- and $-0x80, %r9
- sub %r9, %r8
- sub %r8, %rcx
- sub %r8, %rdx
- add %r9, %r8
-L(gobble_256bytes_nt_loop_bkw):
- prefetcht1 -0x400(%rcx)
- prefetcht1 -0x3C0(%rcx)
- prefetcht1 -0x380(%rcx)
- prefetcht1 -0x340(%rcx)
- prefetcht1 -0x300(%rcx)
- prefetcht1 -0x2C0(%rcx)
- prefetcht1 -0x280(%rcx)
- prefetcht1 -0x240(%rcx)
- vmovdqu64 -0x100(%rcx), %zmm0
- vmovdqu64 -0xC0(%rcx), %zmm1
- vmovdqu64 -0x80(%rcx), %zmm2
- vmovdqu64 -0x40(%rcx), %zmm3
- vmovntdq %zmm0, -0x100(%r9)
- vmovntdq %zmm1, -0xC0(%r9)
- vmovntdq %zmm2, -0x80(%r9)
- vmovntdq %zmm3, -0x40(%r9)
- sub $256, %rdx
- sub $256, %rcx
- sub $256, %r9
- cmp $256, %rdx
- ja L(gobble_256bytes_nt_loop_bkw)
- sfence
- vmovups %zmm4, -0x80(%r8)
- vmovups %zmm5, -0x40(%r8)
- jmp L(check)
-END (__memmove_avx512_no_vzeroupper)
-
-# ifdef SHARED
-strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
-strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
-# endif
-#endif
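
[Editor's note] In the routine above, L(preloop_large) copies the first 128 bytes with unaligned loads (zmm4/zmm5), rounds the destination up to a 128-byte boundary, and adjusts the source pointer and remaining length by the same amount, so the streaming loop only ever issues aligned non-temporal stores; the head is actually stored back after the sfence to tolerate overlap. A rough C equivalent of that setup is sketched below under simplifying assumptions: the helper names are hypothetical, the buffers are assumed not to overlap (so the head can be stored up front), and n is assumed comfortably larger than 256 bytes, as in the large-copy path.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Stand-in for the vmovntdq streaming loop: by the time this is
       called, d is 128-byte aligned.  */
    static void
    copy_stream_aligned (unsigned char *d, const unsigned char *s, size_t n)
    {
      while (n >= 128)
        {
          memcpy (d, s, 128);
          d += 128;
          s += 128;
          n -= 128;
        }
      memcpy (d, s, n);		/* tail */
    }

    /* Assumes dst and src do not overlap and n is well above 256 bytes.  */
    static void
    copy_large (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* Unaligned head: the first 128 bytes are always written here, so
         skipping up to 128 bytes below never leaves a gap.  */
      memcpy (d, s, 128);

      /* Advance to the next 128-byte boundary of the destination and move
         the source and the remaining length by the same amount.  */
      size_t skip = 128 - ((uintptr_t) d & 127);
      d += skip;
      s += skip;
      n -= skip;

      copy_stream_aligned (d, s, n);
    }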
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
deleted file mode 100644
index aac1515cf6..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ /dev/null
@@ -1,12 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define SECTION(p) p##.avx512
-# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
deleted file mode 100644
index dee3ec529c..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ /dev/null
@@ -1,553 +0,0 @@
-/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* memmove/memcpy/mempcpy is implemented as:
- 1. Use overlapping load and store to avoid branch.
- 2. Load all sources into registers and store them together to avoid
- possible address overlap between source and destination.
- 3. If size is 8 * VEC_SIZE or less, load all sources into registers
- and store them together.
- 4. If address of destination > address of source, backward copy
- 4 * VEC_SIZE at a time with unaligned load and aligned store.
- Load the first 4 * VEC and last VEC before the loop and store
- them after the loop to support overlapping addresses.
- 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
- load and aligned store. Load the last 4 * VEC and first VEC
- before the loop and store them after the loop to support
- overlapping addresses.
- 6. If size >= __x86_shared_non_temporal_threshold and there is no
- overlap between destination and source, use non-temporal store
- instead of aligned store. */
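
[Editor's note] Steps 1 and 3 of the scheme above rely on a pair of possibly overlapping chunks: one anchored at the start of the buffer, one ending at its last byte, with every load issued before any store. A minimal C sketch of that idea for 16 <= n <= 32 (VEC_SIZE == 16) follows, using fixed-size memcpy calls in place of the SSE/AVX moves; copy_16_to_32 is a hypothetical name, not a glibc routine.

    #include <stddef.h>
    #include <string.h>

    /* Copy n bytes, 16 <= n <= 32, with two possibly overlapping 16-byte
       chunks.  Both chunks are loaded before either is stored, so the
       copy is correct even when source and destination overlap.  */
    static void
    copy_16_to_32 (void *dst, const void *src, size_t n)
    {
      unsigned char head[16], tail[16];

      memcpy (head, src, 16);
      memcpy (tail, (const unsigned char *) src + n - 16, 16);

      memcpy (dst, head, 16);
      memcpy ((unsigned char *) dst + n - 16, tail, 16);
    }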
-
-#include <sysdep.h>
-
-#ifndef MEMCPY_SYMBOL
-# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMPCPY_SYMBOL
-# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef MEMMOVE_CHK_SYMBOL
-# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
-#endif
-
-/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set
- up REP MOVSB operation, REP MOVSB isn't faster on short data. The
- memcpy micro benchmark in glibc shows that 2KB is the approximate
- value above which REP MOVSB becomes faster than SSE2 optimization
- on processors with Enhanced REP MOVSB. Since larger register size
- can move more data with a single load and store, the threshold is
- higher with larger register size. */
-#ifndef REP_MOVSB_THRESHOLD
-# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
-#endif
-
-#ifndef PREFETCH
-# define PREFETCH(addr) prefetcht0 addr
-#endif
-
-/* Assume 64-byte prefetch size. */
-#ifndef PREFETCH_SIZE
-# define PREFETCH_SIZE 64
-#endif
-
-#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
-
-#if PREFETCH_SIZE == 64
-# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
-# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base)
-# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
-# define PREFETCH_ONE_SET(dir, base, offset) \
- PREFETCH ((offset)base); \
- PREFETCH ((offset + dir * PREFETCH_SIZE)base)
-# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
-#  define PREFETCH_ONE_SET(dir, base, offset) \
-	PREFETCH ((offset)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
-	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
-# else
-# error Unsupported PREFETCHED_LOAD_SIZE!
-# endif
-#else
-# error Unsupported PREFETCH_SIZE!
-#endif
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
- .section SECTION(.text),"ax",@progbits
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
-#endif
-
-#if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
-#endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
- movq %rdi, %rax
-L(start):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
-	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
- VZEROUPPER
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
-#endif
- ret
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
-# if VEC_SIZE == 16
-# if defined SHARED
-/* Only used to measure performance of REP MOVSB. */
-ENTRY (__mempcpy_erms)
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start_movsb)
-END (__mempcpy_erms)
-# endif
-
-ENTRY (__memmove_erms)
- movq %rdi, %rax
-L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
- jb L(movsb_backward)
-1:
- rep movsb
-2:
- ret
-L(movsb_backward):
- leaq -1(%rdi,%rcx), %rdi
- leaq -1(%rsi,%rcx), %rsi
- std
- rep movsb
- cld
- ret
-END (__memmove_erms)
-# if defined SHARED
-strong_alias (__memmove_erms, __memcpy_erms)
-# endif
-# endif
-
-# ifdef SHARED
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
-
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
- jmp L(start_erms)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
- movq %rdi, %rax
-L(start_erms):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
- VMOVU (%rsi), %VEC(0)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
-L(return):
- VZEROUPPER
- ret
-
-L(movsb):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(more_8x_vec)
- cmpq %rsi, %rdi
- jb 1f
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %r9
- cmpq %r9, %rdi
- /* Avoid slow backward REP MOVSB. */
-# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
-# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
-# endif
- jb L(more_8x_vec_backward)
-1:
- movq %rdx, %rcx
- rep movsb
-L(nop):
- ret
-#endif
-
-L(less_vec):
- /* Less than 1 VEC. */
-#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-#endif
-#if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
-#endif
-#if VEC_SIZE > 16
- cmpb $16, %dl
- jae L(between_16_31)
-#endif
- cmpb $8, %dl
- jae L(between_8_15)
- cmpb $4, %dl
- jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movzbl (%rsi), %ecx
- movb %cl, (%rdi)
-1:
- ret
-#if VEC_SIZE > 32
-L(between_32_63):
- /* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
- VZEROUPPER
- ret
-#endif
-#if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
- ret
-#endif
-L(between_8_15):
- /* From 8 to 15. No branch when size == 8. */
- movq -8(%rsi,%rdx), %rcx
- movq (%rsi), %rsi
- movq %rcx, -8(%rdi,%rdx)
- movq %rsi, (%rdi)
- ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl -4(%rsi,%rdx), %ecx
- movl (%rsi), %esi
- movl %ecx, -4(%rdi,%rdx)
- movl %esi, (%rdi)
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movzwl -2(%rsi,%rdx), %ecx
- movzwl (%rsi), %esi
- movw %cx, -2(%rdi,%rdx)
- movw %si, (%rdi)
- ret
-
-#if defined USE_MULTIARCH && IS_IN (libc)
-L(movsb_more_2x_vec):
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- ja L(movsb)
-#endif
-L(more_2x_vec):
- /* More than 2 * VEC and there may be overlap between destination
- and source. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER
- ret
-L(last_4x_vec):
- /* Copy from 2 * VEC to 4 * VEC. */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- VZEROUPPER
- ret
-
-L(more_8x_vec):
- cmpq %rsi, %rdi
- ja L(more_8x_vec_backward)
- /* Source == destination is less common. */
- je L(nop)
- /* Load the first VEC and last 4 * VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
- VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
- /* Save start and stop of the destination buffer. */
- movq %rdi, %r11
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
- /* Align destination for aligned stores in the loop. Compute
- how much destination is misaligned. */
- movq %rdi, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %r8
- /* Adjust source. */
- subq %r8, %rsi
- /* Adjust destination which should be aligned now. */
- subq %r8, %rdi
- /* Adjust length. */
- addq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- ja L(large_forward)
-#endif
-L(loop_4x_vec_forward):
-	/* Copy 4 * VEC at a time forward.  */
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $(VEC_SIZE * 4), %rsi
- subq $(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(1), VEC_SIZE(%rdi)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $(VEC_SIZE * 4), %rdi
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_forward)
- /* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
-
-L(more_8x_vec_backward):
- /* Load the first 4 * VEC and last VEC to support overlapping
- addresses. */
- VMOVU (%rsi), %VEC(4)
- VMOVU VEC_SIZE(%rsi), %VEC(5)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
- /* Save stop of the destination buffer. */
- leaq -VEC_SIZE(%rdi, %rdx), %r11
- /* Align destination end for aligned stores in the loop. Compute
- how much destination end is misaligned. */
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- movq %r11, %r9
- movq %r11, %r8
- andq $(VEC_SIZE - 1), %r8
- /* Adjust source. */
- subq %r8, %rcx
- /* Adjust the end of destination which should be aligned now. */
- subq %r8, %r9
- /* Adjust length. */
- subq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- ja L(large_backward)
-#endif
-L(loop_4x_vec_backward):
-	/* Copy 4 * VEC at a time backward.  */
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $(VEC_SIZE * 4), %rcx
- subq $(VEC_SIZE * 4), %rdx
- VMOVA %VEC(0), (%r9)
- VMOVA %VEC(1), -VEC_SIZE(%r9)
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $(VEC_SIZE * 4), %r9
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec_backward)
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
-
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rdi, %rdx), %r10
- cmpq %r10, %rsi
- jb L(loop_4x_vec_forward)
-L(loop_large_forward):
-	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $PREFETCHED_LOAD_SIZE, %rsi
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%rdi)
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $PREFETCHED_LOAD_SIZE, %rdi
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_forward)
- sfence
- /* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
-
-L(large_backward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rcx, %rdx), %r10
- cmpq %r10, %r9
- jb L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $PREFETCHED_LOAD_SIZE, %rcx
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%r9)
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $PREFETCHED_LOAD_SIZE, %r9
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_backward)
- sfence
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
-#endif
-END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-
-#ifdef SHARED
-# if IS_IN (libc)
-# ifdef USE_MULTIARCH
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
- MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
- MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-# endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
- MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
-# endif
-#endif
-#if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
- MEMCPY_SYMBOL (__memcpy, unaligned))
-#endif
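The header comment of the file above lists the copy strategy in prose. The C sketch below (illustrative only, not the glibc code; sketch_memmove is an invented name) shows the two ideas that make it overlap-safe: for small sizes, read both ends before writing anything, and for large sizes choose the copy direction from whether the destination starts inside the source range.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *
sketch_memmove (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (n >= 8 && n <= 16)
    {
      /* Mirror of L(between_8_15): load both ends before storing either,
         so overlapping ranges still copy correctly and without a branch
         on the exact size.  */
      uint64_t head, tail;
      memcpy (&head, s, 8);
      memcpy (&tail, s + n - 8, 8);
      memcpy (d, &head, 8);
      memcpy (d + n - 8, &tail, 8);
      return dst;
    }

  /* Mirror of the more_8x_vec split: copy backward only when the
     destination lands inside [src, src + n).  */
  if (d > s && d < s + n)
    for (size_t i = n; i-- > 0; )
      d[i] = s[i];
  else
    for (size_t i = 0; i < n; i++)
      d[i] = s[i];
  return dst;
}

The same load-both-ends trick is what lets the 4 * VEC loops above store the saved first and last vectors after the loop instead of special-casing the ragged edges.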
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
deleted file mode 100644
index 8c534e83e0..0000000000
--- a/sysdeps/x86_64/multiarch/memmove.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Multiple versions of memmove
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. */
-#if IS_IN (libc)
- .text
-ENTRY(__libc_memmove)
- .type __libc_memmove, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memmove_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memmove_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memmove_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memmove_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memmove_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memmove_ssse3(%rip), %RAX_LP
-2: ret
-END(__libc_memmove)
-#endif
-
-#if IS_IN (libc)
-# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
-
-# ifdef SHARED
-libc_hidden_ver (__memmove_sse2_unaligned, memmove)
-libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
-libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
-libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memmove calls through a PLT.
- The speedup we get from using SSE2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def
-# endif
-strong_alias (__libc_memmove, memmove)
-#endif
-
-#if !defined SHARED || !IS_IN (libc)
-weak_alias (__mempcpy, mempcpy)
-#endif
-
-#include "../memmove.S"
-
-#if defined SHARED && IS_IN (libc)
-# include <shlib-compat.h>
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-/* Use __memmove_sse2_unaligned to support overlapping addresses. */
-compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
-# endif
-#endif
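For reference, the selection order encoded by the deleted IFUNC resolver above, rewritten as plain C. This is a sketch: the feature flags are passed in as booleans here instead of being read through HAS_ARCH_FEATURE / HAS_CPU_FEATURE, and the chosen variant is returned as a name string purely for illustration.

struct memmove_features
{
  int prefer_erms, prefer_no_avx512, avx512f_usable, prefer_no_vzeroupper;
  int erms, avx_fast_unaligned_load, fast_unaligned_copy, ssse3;
  int fast_copy_backward;
};

static const char *
pick_memmove (const struct memmove_features *f)
{
  if (f->prefer_erms)
    return "__memmove_erms";
  if (!f->prefer_no_avx512 && f->avx512f_usable)
    {
      if (f->prefer_no_vzeroupper)
        return "__memmove_avx512_no_vzeroupper";
      return f->erms ? "__memmove_avx512_unaligned_erms"
                     : "__memmove_avx512_unaligned";
    }
  if (f->avx_fast_unaligned_load)
    return f->erms ? "__memmove_avx_unaligned_erms"
                   : "__memmove_avx_unaligned";
  if (f->fast_unaligned_copy)
    return f->erms ? "__memmove_sse2_unaligned_erms"
                   : "__memmove_sse2_unaligned";
  if (!f->ssse3)
    return "__memmove_sse2_unaligned";
  return f->fast_copy_backward ? "__memmove_ssse3_back" : "__memmove_ssse3";
}

The real resolver returns the chosen function's address in %rax, which ld.so records when it processes the ifunc relocation for __libc_memmove.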
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S
deleted file mode 100644
index 7870dd0247..0000000000
--- a/sysdeps/x86_64/multiarch/memmove_chk.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Multiple versions of __memmove_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch memmove functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__memmove_chk)
- .type __memmove_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __memmove_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __memmove_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__memmove_chk)
-# else
-# include "../memmove_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
deleted file mode 100644
index b8b2b28094..0000000000
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Multiple versions of mempcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
-   DSO.  In static binaries we need mempcpy before initialization
-   has happened.  */
-#if defined SHARED && IS_IN (libc)
- .text
-ENTRY(__mempcpy)
- .type __mempcpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __mempcpy_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __mempcpy_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __mempcpy_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __mempcpy_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __mempcpy_ssse3(%rip), %RAX_LP
-2: ret
-END(__mempcpy)
-
-weak_alias (__mempcpy, mempcpy)
-#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
deleted file mode 100644
index 072b22c49f..0000000000
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Multiple versions of __mempcpy_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib and for
- DSO. There are no multiarch mempcpy functions for static binaries.
- */
-#if IS_IN (libc)
-# ifdef SHARED
- .text
-ENTRY(__mempcpy_chk)
- .type __mempcpy_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 1f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
- lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP
- ret
-1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz L(Fast_Unaligned_Load)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
- ret
-L(Fast_Unaligned_Load):
- lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
- jz L(SSSE3)
- HAS_CPU_FEATURE (ERMS)
- jz 2f
- lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
- ret
-L(SSSE3):
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
- lea __mempcpy_chk_ssse3(%rip), %RAX_LP
-2: ret
-END(__mempcpy_chk)
-# else
-# include "../mempcpy_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
deleted file mode 100644
index 7ab3d89849..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ /dev/null
@@ -1,22 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
-
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
-
-# define SECTION(p) p##.avx
-# define MEMSET_SYMBOL(p,s) p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
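The MEMSET_VDUP_TO_VEC0_AND_SET_RETURN / WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN macros deleted above splat the fill value across a 256-bit register and stash the destination pointer as the eventual return value. The same setup written with AVX2 intrinsics, as a sketch only (function names are invented; this is not the glibc code path and assumes a compiler targeting AVX2):

#include <immintrin.h>

/* memset flavour: vmovd + vpbroadcastb.  */
static void *
memset_avx2_setup (void *dst, int c, __m256i *pattern)
{
  *pattern = _mm256_set1_epi8 ((char) c);   /* byte splat across the ymm */
  return dst;                               /* movq %rdi, %rax */
}

/* wmemset flavour: vmovd + vpbroadcastd.  */
static void *
wmemset_avx2_setup (void *dst, int c, __m256i *pattern)
{
  *pattern = _mm256_set1_epi32 (c);         /* 32-bit element splat */
  return dst;
}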
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
deleted file mode 100644
index 1f66602398..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,194 +0,0 @@
-/* memset optimized with AVX512 for KNL hardware.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-#ifndef MEMSET
-# define MEMSET __memset_avx512_no_vzeroupper
-# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
-#endif
-
- .section .text.avx512,"ax",@progbits
-#if defined PIC
-ENTRY (MEMSET_CHK)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMSET_CHK)
-#endif
-
-ENTRY (MEMSET)
- vpxor %xmm0, %xmm0, %xmm0
- vmovd %esi, %xmm1
- lea (%rdi, %rdx), %rsi
- mov %rdi, %rax
- vpshufb %xmm0, %xmm1, %xmm0
- cmp $16, %rdx
- jb L(less_16bytes)
- cmp $512, %rdx
- vbroadcastss %xmm0, %zmm2
- ja L(512bytesormore)
- cmp $256, %rdx
- jb L(less_256bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm2, 0xC0(%rdi)
- vmovups %zmm2, -0x100(%rsi)
- vmovups %zmm2, -0xC0(%rsi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_256bytes):
- cmp $128, %dl
- jb L(less_128bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_128bytes):
- cmp $64, %dl
- jb L(less_64bytes)
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-L(less_64bytes):
- cmp $32, %dl
- jb L(less_32bytes)
- vmovdqu %ymm2, (%rdi)
- vmovdqu %ymm2, -0x20(%rsi)
- ret
-
-L(less_32bytes):
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm0, -0x10(%rsi)
- ret
-
-L(less_16bytes):
- cmp $8, %dl
- jb L(less_8bytes)
- vmovq %xmm0, (%rdi)
- vmovq %xmm0, -0x08(%rsi)
- ret
-
-L(less_8bytes):
- vmovd %xmm0, %ecx
- cmp $4, %dl
- jb L(less_4bytes)
- mov %ecx, (%rdi)
- mov %ecx, -0x04(%rsi)
- ret
-
-L(less_4bytes):
- cmp $2, %dl
- jb L(less_2bytes)
- mov %cx, (%rdi)
- mov %cx, -0x02(%rsi)
- ret
-
-L(less_2bytes):
- cmp $1, %dl
- jb L(less_1bytes)
- mov %cl, (%rdi)
-L(less_1bytes):
- ret
-
-L(512bytesormore):
- mov __x86_shared_cache_size_half(%rip), %rcx
- cmp %rcx, %rdx
- ja L(preloop_large)
- cmp $1024, %rdx
- ja L(1024bytesormore)
-
- vmovups %zmm2, (%rdi)
- vmovups %zmm2, 0x40(%rdi)
- vmovups %zmm2, 0x80(%rdi)
- vmovups %zmm2, 0xC0(%rdi)
- vmovups %zmm2, 0x100(%rdi)
- vmovups %zmm2, 0x140(%rdi)
- vmovups %zmm2, 0x180(%rdi)
- vmovups %zmm2, 0x1C0(%rdi)
- vmovups %zmm2, -0x200(%rsi)
- vmovups %zmm2, -0x1C0(%rsi)
- vmovups %zmm2, -0x180(%rsi)
- vmovups %zmm2, -0x140(%rsi)
- vmovups %zmm2, -0x100(%rsi)
- vmovups %zmm2, -0xC0(%rsi)
- vmovups %zmm2, -0x80(%rsi)
- vmovups %zmm2, -0x40(%rsi)
- ret
-
-/* Align on 64 and loop with aligned stores. */
-L(1024bytesormore):
- sub $0x100, %rsi
- vmovups %zmm2, (%rax)
- and $-0x40, %rdi
- add $0x40, %rdi
-
-L(gobble_256bytes_loop):
- vmovaps %zmm2, (%rdi)
- vmovaps %zmm2, 0x40(%rdi)
- vmovaps %zmm2, 0x80(%rdi)
- vmovaps %zmm2, 0xC0(%rdi)
- add $0x100, %rdi
- cmp %rsi, %rdi
- jb L(gobble_256bytes_loop)
- vmovups %zmm2, (%rsi)
- vmovups %zmm2, 0x40(%rsi)
- vmovups %zmm2, 0x80(%rsi)
- vmovups %zmm2, 0xC0(%rsi)
- ret
-
-/* Align on 128 and loop with non-temporal stores. */
-L(preloop_large):
- and $-0x80, %rdi
- add $0x80, %rdi
- vmovups %zmm2, (%rax)
- vmovups %zmm2, 0x40(%rax)
- sub $0x200, %rsi
-
-L(gobble_512bytes_nt_loop):
- vmovntdq %zmm2, (%rdi)
- vmovntdq %zmm2, 0x40(%rdi)
- vmovntdq %zmm2, 0x80(%rdi)
- vmovntdq %zmm2, 0xC0(%rdi)
- vmovntdq %zmm2, 0x100(%rdi)
- vmovntdq %zmm2, 0x140(%rdi)
- vmovntdq %zmm2, 0x180(%rdi)
- vmovntdq %zmm2, 0x1C0(%rdi)
- add $0x200, %rdi
- cmp %rsi, %rdi
- jb L(gobble_512bytes_nt_loop)
- sfence
- vmovups %zmm2, (%rsi)
- vmovups %zmm2, 0x40(%rsi)
- vmovups %zmm2, 0x80(%rsi)
- vmovups %zmm2, 0xC0(%rsi)
- vmovups %zmm2, 0x100(%rsi)
- vmovups %zmm2, 0x140(%rsi)
- vmovups %zmm2, 0x180(%rsi)
- vmovups %zmm2, 0x1C0(%rsi)
- ret
-END (MEMSET)
-#endif
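The mid-size buckets of the deleted KNL memset (for example the 256-to-512-byte case) avoid any loop by issuing fixed-width stores from both ends of the buffer so the two groups overlap in the middle. A byte-level C sketch of that idea (illustrative only; set_both_ends is an invented name):

#include <stddef.h>
#include <string.h>

/* Fill [dst, dst + n) given k <= n <= 2 * k by writing one k-byte block
   from each end; the blocks overlap in the middle.  With k = 256 this is
   the shape of the 256..512-byte bucket above, which issues eight 64-byte
   vmovups from the front and eight counting back from the end.  */
static void
set_both_ends (unsigned char *dst, int c, size_t n, size_t k)
{
  memset (dst, c, k);           /* front block */
  memset (dst + n - k, c, k);   /* back block, overlapping the front */
}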
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
deleted file mode 100644
index 0783979ca5..0000000000
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ /dev/null
@@ -1,24 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE 64
-# define VEC(i) zmm##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
-
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
-
-# define SECTION(p) p##.avx512
-# define MEMSET_SYMBOL(p,s) p##_avx512_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
deleted file mode 100644
index 2eb9e3744e..0000000000
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ /dev/null
@@ -1,263 +0,0 @@
-/* memset/bzero with unaligned store and rep stosb
- Copyright (C) 2016-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* memset is implemented as:
-   1. Use overlapping store to avoid branches.
- 2. If size is less than VEC, use integer register stores.
- 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
- 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
-   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
-
-#include <sysdep.h>
-
-#ifndef MEMSET_CHK_SYMBOL
-# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
-#endif
-
-#ifndef WMEMSET_CHK_SYMBOL
-# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
-#endif
-
-#ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-# define VZEROUPPER_SHORT_RETURN vzeroupper
-# else
-# define VZEROUPPER_SHORT_RETURN rep
-# endif
-#endif
-
-#ifndef MOVQ
-# if VEC_SIZE > 16
-# define MOVQ vmovq
-# else
-# define MOVQ movq
-# endif
-#endif
-
-/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
- up REP STOSB operation, REP STOSB isn't faster on short data. The
- memset micro benchmark in glibc shows that 2KB is the approximate
- value above which REP STOSB becomes faster on processors with
- Enhanced REP STOSB. Since the stored value is fixed, larger register
- size has minimal impact on threshold. */
-#ifndef REP_STOSB_THRESHOLD
-# define REP_STOSB_THRESHOLD 2048
-#endif
-
-#ifndef SECTION
-# error SECTION is not defined!
-#endif
-
- .section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
- movq %rdi, %rax /* Set return value. */
- movq %rsi, %rdx /* Set n. */
- pxor %xmm0, %xmm0
- jmp L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
-#endif
-
-#if IS_IN (libc)
-# if defined SHARED
-ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
-# endif
-
-ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
- shlq $2, %rdx
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
-END (WMEMSET_SYMBOL (__wmemset, unaligned))
-#endif
-
-#if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
-#endif
-
-ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-L(entry_from_bzero):
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
-#if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMSET_SYMBOL (__memset, unaligned))
-
-# if VEC_SIZE == 16
-/* Only used to measure performance of REP STOSB. */
-ENTRY (__memset_erms)
-# else
-/* Provide a symbol to the debugger.  */
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
- /* Issue vzeroupper before rep stosb. */
- VZEROUPPER
- movq %rdx, %rcx
- movzbl %sil, %eax
- movq %rdi, %rdx
- rep stosb
- movq %rdx, %rax
- ret
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
-# if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- cmpq $VEC_SIZE, %rdx
- jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(stosb_more_2x_vec)
-	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
-
-L(stosb_more_2x_vec):
- cmpq $REP_STOSB_THRESHOLD, %rdx
- ja L(stosb)
-#endif
-L(more_2x_vec):
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_start)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-L(return):
- VZEROUPPER
- ret
-
-L(loop_start):
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- VMOVU %VEC(0), (%rdi)
- andq $-(VEC_SIZE * 4), %rcx
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- cmpq %rdx, %rcx
- je L(return)
-L(loop):
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(0), VEC_SIZE(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- VZEROUPPER_SHORT_RETURN
- ret
-L(less_vec):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
-# endif
-# if VEC_SIZE > 16
- cmpb $16, %dl
- jae L(between_16_31)
-# endif
- MOVQ %xmm0, %rcx
- cmpb $8, %dl
- jae L(between_8_15)
- cmpb $4, %dl
- jae L(between_4_7)
- cmpb $1, %dl
- ja L(between_2_3)
- jb 1f
- movb %cl, (%rdi)
-1:
- VZEROUPPER
- ret
-# if VEC_SIZE > 32
- /* From 32 to 63. No branch when size == 32. */
-L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
- VZEROUPPER
- ret
-# endif
-# if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
-L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
- VZEROUPPER
- ret
-# endif
- /* From 8 to 15. No branch when size == 8. */
-L(between_8_15):
- movq %rcx, -8(%rdi,%rdx)
- movq %rcx, (%rdi)
- VZEROUPPER
- ret
-L(between_4_7):
- /* From 4 to 7. No branch when size == 4. */
- movl %ecx, -4(%rdi,%rdx)
- movl %ecx, (%rdi)
- VZEROUPPER
- ret
-L(between_2_3):
- /* From 2 to 3. No branch when size == 2. */
- movw %cx, -2(%rdi,%rdx)
- movw %cx, (%rdi)
- VZEROUPPER
- ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
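Step 5 of the header comment in the file above (align, then store 4 * VEC per iteration) corresponds to L(loop_start) and L(loop). A C sketch of that shape, with memset standing in for the vector stores and BLK standing in for 4 * VEC_SIZE (the constant and the function name are illustrative, not glibc's):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define BLK 128   /* stands in for 4 * VEC_SIZE */

/* Assumes n > BLK, matching the "ja L(loop_start)" guard above.  */
static void
memset_large_sketch (unsigned char *d, int c, size_t n)
{
  memset (d, c, BLK);                    /* unaligned head stores (VMOVU) */
  memset (d + n - BLK, c, BLK);          /* unaligned tail stores (VMOVU) */

  /* Start at the BLK-aligned address just below d + BLK (everything before
     it is already covered by the head store) and stop at the aligned
     address just below d + n (the tail store covers the rest), filling the
     middle with aligned block stores like L(loop) does with VMOVA.  */
  unsigned char *p = (unsigned char *) (((uintptr_t) d + BLK)
                                        & -(uintptr_t) BLK);
  unsigned char *end = (unsigned char *) (((uintptr_t) d + n)
                                          & -(uintptr_t) BLK);
  for (; p < end; p += BLK)
    memset (p, c, BLK);
}

Above REP_STOSB_THRESHOLD the unaligned_erms entry point skips all of this and falls through to L(stosb) instead.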
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
deleted file mode 100644
index 11f27378b0..0000000000
--- a/sysdeps/x86_64/multiarch/memset.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Multiple versions of memset
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <shlib-compat.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib. */
-#if IS_IN (libc)
-ENTRY(memset)
- .type memset, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memset_erms(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_ERMS)
- jnz 2f
- lea __memset_sse2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 1f
- lea __memset_sse2_unaligned(%rip), %RAX_LP
-1:
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 2f
- lea __memset_avx2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz L(AVX512F)
- lea __memset_avx2_unaligned(%rip), %RAX_LP
-L(AVX512F):
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 2f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 2f
- lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memset_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memset_avx512_unaligned(%rip), %RAX_LP
-2: ret
-END(memset)
-#endif
-
-#if IS_IN (libc)
-# define MEMSET_SYMBOL(p,s) p##_sse2_##s
-# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memset calls through a PLT.
- The speedup we get from using SSE2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
- .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
- .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
-# endif
-
-# undef weak_alias
-# define weak_alias(original, alias) \
- .weak bzero; bzero = __bzero
-
-# undef strong_alias
-# define strong_alias(original, alias)
-#endif
-
-#include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
deleted file mode 100644
index 7e08311cdf..0000000000
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Multiple versions of memset_chk
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in lib. */
-#if IS_IN (libc)
-# ifdef SHARED
-ENTRY(__memset_chk)
- .type __memset_chk, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 1f
- lea __memset_chk_sse2_unaligned(%rip), %RAX_LP
-1:
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 2f
- lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz L(AVX512F)
- lea __memset_chk_avx2_unaligned(%rip), %RAX_LP
-L(AVX512F):
- HAS_ARCH_FEATURE (Prefer_No_AVX512)
- jnz 2f
- HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 2f
- lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 2f
- lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP
- HAS_CPU_FEATURE (ERMS)
- jnz 2f
- lea __memset_chk_avx512_unaligned(%rip), %RAX_LP
-2: ret
-END(__memset_chk)
-
-strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
- .section .gnu.warning.__memset_zero_constant_len_parameter
- .string "memset used with constant zero length parameter; this could be due to transposed parameters"
-# else
-# include "../memset_chk.S"
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/sched_cpucount.c b/sysdeps/x86_64/multiarch/sched_cpucount.c
deleted file mode 100644
index 453f183747..0000000000
--- a/sysdeps/x86_64/multiarch/sched_cpucount.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Count bits in CPU set. x86-64 multi-arch version.
- This file is part of the GNU C Library.
- Copyright (C) 2008-2017 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sched.h>
-#include "init-arch.h"
-
-#define __sched_cpucount static generic_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-#define POPCNT(l) \
- ({ __cpu_mask r; \
- asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\
- r; })
-#define __sched_cpucount static popcount_cpucount
-#include <posix/sched_cpucount.c>
-#undef __sched_cpucount
-
-libc_ifunc (__sched_cpucount,
- HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount);
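The deleted sched_cpucount.c compiles the generic posix/sched_cpucount.c twice, once as a plain C bit count and once with the POPCNT asm macro, and libc_ifunc picks between them at load time. A standalone sketch of the two counting strategies, using __builtin_popcountl in place of the inline asm (the type and function names below are illustrative):

#include <stddef.h>

typedef unsigned long cpu_mask_sketch;

static int
cpucount_generic (const cpu_mask_sketch *bits, size_t nwords)
{
  int count = 0;
  for (size_t i = 0; i < nwords; i++)
    for (cpu_mask_sketch w = bits[i]; w != 0; w &= w - 1)
      count++;                            /* clear lowest set bit per step */
  return count;
}

static int
cpucount_popcnt (const cpu_mask_sketch *bits, size_t nwords)
{
  int count = 0;
  for (size_t i = 0; i < nwords; i++)
    count += __builtin_popcountl (bits[i]);  /* popcnt when built with -mpopcnt */
  return count;
}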
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
deleted file mode 100644
index 34231f8b46..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S
deleted file mode 100644
index ee81ab6ae3..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy.S
+++ /dev/null
@@ -1,9 +0,0 @@
-/* Multiple versions of stpcpy
- All versions must be listed in ifunc-impl-list.c. */
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy
-#include "strcpy.S"
-
-weak_alias (__stpcpy, stpcpy)
-libc_hidden_def (__stpcpy)
-libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
deleted file mode 100644
index 2fde77dcab..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STPNCPY __stpncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
- __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
-#endif
-
-#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
deleted file mode 100644
index 658520f78f..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
deleted file mode 100644
index 2698ca6a8c..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of stpncpy
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY __stpncpy
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#include "strcpy.S"
-
-weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.S b/sysdeps/x86_64/multiarch/strcasecmp_l.S
deleted file mode 100644
index 49f5b9fd95..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of strcasecmp and strcasecmp_l
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP __strcasecmp_l
-#define USE_AS_STRCASECMP_L
-#include "strcmp.S"
-
-weak_alias (__strcasecmp_l, strcasecmp_l)
-libc_hidden_def (strcasecmp_l)
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
deleted file mode 100644
index d0a8a1518a..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ /dev/null
@@ -1,279 +0,0 @@
-/* strcat with SSE2
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_sse2_unaligned
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-/* Inline the corresponding strlen file; this is temporary until the new
-   strcpy implementation gets merged.  */
-
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- pmovmskb %xmm0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 80(%rax), %xmm0
- add $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm1
- add $16, %rax
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm2
- add $16, %rax
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm3
- add $16, %rax
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $16, %rax
- .p2align 4
- L(align64_loop):
- movaps (%rax), %xmm4
- pminub 16(%rax), %xmm4
- movaps 32(%rax), %xmm5
- pminub 48(%rax), %xmm5
- add $64, %rax
- pminub %xmm4, %xmm5
- pcmpeqb %xmm0, %xmm5
- pmovmskb %xmm5, %edx
- test %edx, %edx
- jz L(align64_loop)
-
- pcmpeqb -64(%rax), %xmm0
- sub $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $16, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $32, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $48, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit64):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-sse2-unaligned.S"
-#endif
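The inlined strlen above scans 16 bytes per step with pcmpeqb/pmovmskb and finishes with bsf. An SSE2-intrinsics sketch of the same scan (illustrative only, not the glibc code; like the assembly, it relies on aligned 16-byte loads staying within one page even when they start before the string):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t
strlen_sse2_sketch (const char *s)
{
  /* Align down to 16 so vector loads never cross a page boundary, then
     mask off the match bits for bytes that precede the string start.  */
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
  unsigned int skip = (unsigned int) ((uintptr_t) s & 15);
  __m128i zero = _mm_setzero_si128 ();

  unsigned int mask = (unsigned int)
    _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
                                       zero));
  mask = (mask >> skip) << skip;          /* ignore bytes before s */
  while (mask == 0)
    {
      p += 16;
      mask = (unsigned int)
        _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
                                           zero));
    }
  return (size_t) (p - s) + __builtin_ctz (mask);   /* bsf on the bitmask */
}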
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index edd683d778..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,867 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline the corresponding strlen file; this is temporary until the new
-   strcpy implementation gets merged.  */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
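
The deleted __strcat_ssse3 above follows the structure noted in its opening comment: scan the destination for its terminating NUL (byte by byte for the first 16 bytes, then 16 bytes at a time with pcmpeqb/pmovmskb, then 64 bytes per iteration), and fall through into the strcpy code to append the source. Below is a minimal C sketch of that two-phase shape using SSE2 intrinsics; the helper names (sse2_strlen_16, my_strcat) are illustrative, not glibc symbols, and the prologue is simplified to a plain byte loop.

/* Sketch of the strcat structure used above: find the destination's NUL
   with 16-byte SSE2 compares, then copy the source to that point.
   Illustrative only; function and variable names are not from glibc.  */
#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

static size_t sse2_strlen_16 (const char *s)
{
  /* Scalar check up to the next 16-byte boundary, then aligned 16-byte
     loads; an aligned load never crosses a page, so reading a few bytes
     past the NUL within the block is safe.  */
  const char *p = s;
  while (((uintptr_t) p & 15) != 0)
    {
      if (*p == '\0')
        return (size_t) (p - s);
      ++p;
    }
  const __m128i zero = _mm_setzero_si128 ();
  for (;;)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      if (mask != 0)
        return (size_t) (p - s) + (size_t) __builtin_ctz (mask);
      p += 16;
    }
}

char *my_strcat (char *dst, const char *src)
{
  /* Phase 1: locate the end of dst; phase 2: copy src there.  */
  strcpy (dst + sse2_strlen_16 (dst), src);
  return dst;
}
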
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
deleted file mode 100644
index 0e0e5dda9c..0000000000
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Multiple versions of strcat
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifndef USE_AS_STRNCAT
-# ifndef STRCAT
-# define STRCAT strcat
-# endif
-#endif
-
-#ifdef USE_AS_STRNCAT
-# define STRCAT_SSSE3 __strncat_ssse3
-# define STRCAT_SSE2 __strncat_sse2
-# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned
-# define __GI_STRCAT __GI_strncat
-# define __GI___STRCAT __GI___strncat
-#else
-# define STRCAT_SSSE3 __strcat_ssse3
-# define STRCAT_SSE2 __strcat_sse2
-# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned
-# define __GI_STRCAT __GI_strcat
-# define __GI___STRCAT __GI___strcat
-#endif
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(STRCAT)
- .type STRCAT, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCAT_SSE2_UNALIGNED(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 2f
- leaq STRCAT_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- leaq STRCAT_SSSE3(%rip), %rax
-2: ret
-END(STRCAT)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCAT_SSE2, @function; \
- .align 16; \
- .globl STRCAT_SSE2; \
- .hidden STRCAT_SSE2; \
- STRCAT_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcat calls through a PLT.
-   The speedup we get from using SSSE3 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
-#endif
-
-#ifndef USE_AS_STRNCAT
-# include "../strcat.S"
-#endif
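
strcat.S above is only an IFUNC selector: the gnu_indirect_function resolver picks __strcat_sse2_unaligned, __strcat_ssse3 or __strcat_sse2 from the CPU features loaded by init-arch. The following is a minimal C sketch of the same dispatch pattern using the GCC ifunc attribute; the variant functions, my_strcat, and the __builtin_cpu_supports("ssse3") test are illustrative stand-ins for the feature checks done in the assembly.

/* Minimal IFUNC dispatch sketch (GCC extension).  The variant functions
   and the feature test are illustrative, not the logic of the deleted
   strcat.S, which uses glibc's init-arch macros.  */
#include <string.h>

static char *strcat_generic (char *d, const char *s) { return strcat (d, s); }
static char *strcat_fancy (char *d, const char *s) { return strcat (d, s); }

typedef char *(*strcat_fn) (char *, const char *);

/* The resolver runs once, at relocation time, and returns the variant
   that my_strcat will resolve to.  */
strcat_fn resolve_my_strcat (void)
{
  return __builtin_cpu_supports ("ssse3") ? strcat_fancy : strcat_generic;
}

char *my_strcat (char *, const char *)
     __attribute__ ((ifunc ("resolve_my_strcat")));
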
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
deleted file mode 100644
index cbbd0b33d3..0000000000
--- a/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,280 +0,0 @@
-/* strchr with SSE2 without bsf
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
- atom_text_section
-ENTRY (__strchr_sse2_no_bsf)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %eax
- pmovmskb %xmm3, %edx
- andl %esi, %eax
- andl %esi, %edx
- test %eax, %eax
- jnz L(matches)
- test %edx, %edx
- jnz L(return_null)
-
-L(loop):
- movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %eax
- pmovmskb %xmm3, %edx
- or %eax, %edx
- jz L(loop)
-
- pmovmskb %xmm3, %edx
- test %eax, %eax
- jnz L(matches)
-
-/* Return NULL. */
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-L(matches):
-	/* There is a match.  First find where the NUL is.  */
- leaq -16(%rdi), %rdi
- test %edx, %edx
- jz L(match_case1)
-
- .p2align 4
-L(match_case2):
- test %al, %al
- jz L(match_high_case2)
-
- mov %al, %cl
- and $15, %cl
- jnz L(match_case2_4)
-
- mov %dl, %ch
- and $15, %ch
- jnz L(return_null)
-
- test $0x10, %al
- jnz L(Exit5)
- test $0x10, %dl
- jnz L(return_null)
- test $0x20, %al
- jnz L(Exit6)
- test $0x20, %dl
- jnz L(return_null)
- test $0x40, %al
- jnz L(Exit7)
- test $0x40, %dl
- jnz L(return_null)
- lea 7(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case2_4):
- test $0x01, %al
- jnz L(Exit1)
- test $0x01, %dl
- jnz L(return_null)
- test $0x02, %al
- jnz L(Exit2)
- test $0x02, %dl
- jnz L(return_null)
- test $0x04, %al
- jnz L(Exit3)
- test $0x04, %dl
- jnz L(return_null)
- lea 3(%rdi), %rax
- ret
-
- .p2align 4
-L(match_high_case2):
- test %dl, %dl
- jnz L(return_null)
-
- mov %ah, %cl
- and $15, %cl
- jnz L(match_case2_12)
-
- mov %dh, %ch
- and $15, %ch
- jnz L(return_null)
-
- test $0x10, %ah
- jnz L(Exit13)
- test $0x10, %dh
- jnz L(return_null)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x20, %dh
- jnz L(return_null)
- test $0x40, %ah
- jnz L(Exit15)
- test $0x40, %dh
- jnz L(return_null)
- lea 15(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case2_12):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x01, %dh
- jnz L(return_null)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x02, %dh
- jnz L(return_null)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x04, %dh
- jnz L(return_null)
- lea 11(%rdi), %rax
- ret
-
- .p2align 4
-L(match_case1):
- test %al, %al
- jz L(match_high_case1)
-
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- lea 7(%rdi), %rax
- ret
-
- .p2align 4
-L(match_high_case1):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- lea 15(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit1):
- lea (%rdi), %rax
- ret
-
- .p2align 4
-L(Exit2):
- lea 1(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit3):
- lea 2(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit4):
- lea 3(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit5):
- lea 4(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit6):
- lea 5(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit7):
- lea 6(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit9):
- lea 8(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit10):
- lea 9(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit11):
- lea 10(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit12):
- lea 11(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit13):
- lea 12(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit14):
- lea 13(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit15):
- lea 14(%rdi), %rax
- ret
-
-END (__strchr_sse2_no_bsf)
-#endif
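
__strchr_sse2_no_bsf above compares each 16-byte block against both the target character and zero, then inspects the two masks so that a NUL bit preceding the first match bit yields NULL; the long bit-by-bit tail exists because this variant avoids bsf (it is selected via the Slow_BSF feature in strchr.S below). Here is a hedged C sketch of the dual-mask idea, which for brevity uses __builtin_ctz where the assembly uses the unrolled tests; my_strchr is an illustrative name.

/* Dual-mask strchr sketch: one compare for the target byte, one for NUL;
   whichever bit comes first decides the result.  Illustrative only.  */
#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

char *my_strchr (const char *s, int c)
{
  const __m128i needle = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();
  /* Start at the enclosing 16-byte block and mask off the bytes before s,
     like the shl %cl, %esi trick in the prologue above.  */
  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
  unsigned int skip = (unsigned int) ((uintptr_t) s & 15);
  unsigned int valid = 0xffffffffu << skip;
  for (;;)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p);
      unsigned int m_c = (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, needle)) & valid;
      unsigned int m_0 = (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero)) & valid;
      if (m_c | m_0)
        {
          /* If the NUL comes strictly before the match, the char is absent. */
          if (m_0 != 0 && (m_c == 0 || __builtin_ctz (m_0) < __builtin_ctz (m_c)))
            return NULL;
          return (char *) p + __builtin_ctz (m_c);
        }
      p += 16;
      valid = 0xffffffffu;
    }
}
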
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
deleted file mode 100644
index c9f54ca2e2..0000000000
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Multiple versions of strchr
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
-   The speedup we get from using SSE4.2 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strchr; __GI_strchr = __strchr_sse2
-#endif
-
-#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
deleted file mode 100644
index b0992dce39..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ /dev/null
@@ -1,213 +0,0 @@
-/* strcmp with unaligned loads
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-#include "sysdep.h"
-
-ENTRY ( __strcmp_sse2_unaligned)
- movl %edi, %eax
- xorl %edx, %edx
- pxor %xmm7, %xmm7
- orl %esi, %eax
- andl $4095, %eax
- cmpl $4032, %eax
- jg L(cross_page)
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- testq %rax, %rax
- je L(next_48_bytes)
-L(return):
- bsfq %rax, %rdx
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm6
- movdqu 16(%rsi), %xmm3
- movdqu 32(%rdi), %xmm5
- pcmpeqb %xmm6, %xmm3
- movdqu 32(%rsi), %xmm2
- pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
- movdqu 48(%rdi), %xmm4
- pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
- movdqu 48(%rsi), %xmm0
- pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm2, %eax
- salq $16, %rdx
- pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %rax
- orq %rdx, %rax
- pmovmskb %xmm0, %ecx
- movq %rcx, %rdx
- salq $48, %rdx
- orq %rdx, %rax
- jne L(return)
-L(main_loop_header):
- leaq 64(%rdi), %rdx
- movl $4096, %ecx
- pxor %xmm9, %xmm9
- andq $-64, %rdx
- subq %rdi, %rdx
- leaq (%rdi, %rdx), %rax
- addq %rsi, %rdx
- movq %rdx, %rsi
- andl $4095, %esi
- subq %rsi, %rcx
- shrq $6, %rcx
- movq %rcx, %rsi
- jmp L(loop_start)
-
- .p2align 4
-L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
-L(back_to_loop):
- movdqu (%rdx), %xmm0
- movdqu 16(%rdx), %xmm1
- movdqa (%rax), %xmm2
- movdqa 16(%rax), %xmm3
- pcmpeqb %xmm2, %xmm0
- movdqu 32(%rdx), %xmm5
- pcmpeqb %xmm3, %xmm1
- pminub %xmm2, %xmm0
- movdqu 48(%rdx), %xmm6
- pminub %xmm3, %xmm1
- movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
- movdqa 48(%rax), %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm3, %xmm6
- pminub %xmm2, %xmm5
- pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- je L(loop)
- pcmpeqb %xmm7, %xmm5
- movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pcmpeqb %xmm7, %xmm6
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
- salq $32, %r8
- pmovmskb %xmm6, %esi
- orq %r8, %rcx
- orq %rdi, %rcx
- salq $48, %rsi
- orq %rsi, %rcx
- bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(loop_cross_page):
- xor %r10, %r10
- movq %rdx, %r9
- and $63, %r9
- subq %r9, %r10
-
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
- pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
- pcmpeqb %xmm3, %xmm1
- pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
- pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm3, %xmm6
- pminub %xmm2, %xmm5
- pminub %xmm3, %xmm6
-
- pcmpeqb %xmm7, %xmm0
- pcmpeqb %xmm7, %xmm1
- pcmpeqb %xmm7, %xmm5
- pcmpeqb %xmm7, %xmm6
-
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
- salq $32, %r8
- pmovmskb %xmm6, %esi
- orq %r8, %rdi
- orq %rcx, %rdi
- salq $48, %rsi
- orq %rsi, %rdi
- movq %r9, %rcx
- movq $63, %rsi
- shrq %cl, %rdi
- test %rdi, %rdi
- je L(back_to_loop)
- bsfq %rdi, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
- ret
-
- .p2align 4
-L(cross_page_loop):
- cmpb %cl, %al
- jne L(different)
- addq $1, %rdx
- cmpq $64, %rdx
- je L(main_loop_header)
-L(cross_page):
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
- testb %al, %al
- jne L(cross_page_loop)
- xorl %eax, %eax
-L(different):
- subl %ecx, %eax
- ret
-END (__strcmp_sse2_unaligned)
-
-#endif
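
The kernel of __strcmp_sse2_unaligned above is the pcmpeqb / pminub / pcmpeqb-with-zero sequence: taking the byte-wise minimum of the equality mask and one source leaves a zero byte exactly where the strings differ or where the first string ends, so a single pmovmskb locates the first interesting position. A hedged C sketch of one 16-byte step follows; cmp16_block is an illustrative name, and the sketch omits the page-crossing guard (the cmpl $4032, %eax check) that the real code performs before its unaligned loads.

/* One 16-byte step of the strcmp kernel above: returns -1 if the block is
   equal and not yet terminated, else the index of the first byte that
   differs or ends s1.  Assumes 16 readable bytes at both pointers.  */
#include <emmintrin.h>

static int cmp16_block (const char *s1, const char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  __m128i eq = _mm_cmpeq_epi8 (a, b);              /* 0xff where bytes match  */
  __m128i t  = _mm_min_epu8 (eq, a);               /* 0 where mismatch or NUL */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (t, _mm_setzero_si128 ()));
  return mask ? __builtin_ctz (mask) : -1;
}

On a hit the caller re-loads the two bytes at the returned index and returns their difference, as L(return) above does.
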
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
deleted file mode 100644
index ed26d4a8fb..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ /dev/null
@@ -1,1792 +0,0 @@
-/* strcmp with SSE4.2
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-
-/* We use 0x1a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_NEGATIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to find out if two 16-byte data elements are the same
-   and the offset of the first differing byte.  There are 4 cases:
-
-   1. Both 16-byte data elements are valid and identical.
-   2. Both 16-byte data elements contain EOS and are identical.
-   3. Both 16-byte data elements are valid and they differ at offset X.
-   4. At least one 16-byte data element has EOS at offset X.  The two
-      16-byte data elements must differ at or before offset X.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2 16 0 1 1
- 3 X 1 0 0
- 4 0 <= X 1 0/1 0/1
-
- We exit from the loop for cases 2, 3 and 4 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
- case 2. */
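
Below is a hedged C rendering of the same check via the SSE4.2 string intrinsics; the _SIDD_* constants OR together to the 0x1a immediate used above, and cmp16_pcmpistri is an illustrative wrapper, not glibc code.

/* pcmpistri with mode 0x1a, as intrinsics.  The return value is ECX from
   the table above; *stop is set for cases 2-4, which is what the jbe exits
   in the loops below test (CFlag or ZFlag).  */
#include <nmmintrin.h>

#define MODE (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH \
              | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)

static int cmp16_pcmpistri (__m128i s1, __m128i s2, int *stop)
{
  int idx = _mm_cmpistri (s1, s2, MODE);   /* ECX: first differing offset   */
  int cf  = _mm_cmpistrc (s1, s2, MODE);   /* CFlag: such an offset exists  */
  int zf  = _mm_cmpistrz (s1, s2, MODE);   /* ZFlag: s2 contains an EOS     */
  *stop = cf | zf;
  return idx;
}
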
-
- /* Put all SSE 4.2 functions together. */
- .section .text.SECTION,"ax",@progbits
- .align 16
- .type STRCMP_SSE42, @function
- .globl STRCMP_SSE42
- .hidden STRCMP_SSE42
-#ifdef USE_AS_STRCASECMP_L
-ENTRY (GLABEL(__strcasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RDX_LP
-
-	// XXX The 5-byte NOP should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
-END (GLABEL(__strcasecmp))
- /* FALLTHROUGH to strcasecmp_l. */
-#endif
-#ifdef USE_AS_STRNCASECMP_L
-ENTRY (GLABEL(__strncasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RCX_LP
-
-	// XXX The 5-byte NOP should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
-END (GLABEL(__strncasecmp))
- /* FALLTHROUGH to strncasecmp_l. */
-#endif
-
-
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
-
-STRCMP_SSE42:
- cfi_startproc
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to compare up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-# else
- mov (%rdx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strcasecmp_l_nonascii
-#endif
-#ifdef USE_AS_STRNCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-# else
- mov (%rcx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strncasecmp_l_nonascii
-#endif
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
- je LABEL(strcmp_exitz)
- cmp $1, %rdx
- je LABEL(Byte0)
- mov %rdx, %r11
-#endif
- mov %esi, %ecx
- mov %edi, %eax
-/* Use 64bit AND here to avoid long NOP padding. */
- and $0x3f, %rcx /* rsi alignment in cache line */
- and $0x3f, %rax /* rdi alignment in cache line */
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- .section .rodata.cst16,"aM",@progbits,16
- .align 16
-LABEL(belowupper):
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
- .quad 0x5a5a5a5a5a5a5a5a
- .quad 0x5a5a5a5a5a5a5a5a
-# else
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
- .quad 0x2020202020202020
- .quad 0x2020202020202020
- .previous
- movdqa LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
- movdqa LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
- movdqa LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
-#endif
- cmp $0x30, %ecx
- ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
- cmp $0x30, %eax
- ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-# define TOLOWER(reg1, reg2) \
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
- vpandn %xmm7, %xmm8, %xmm8; \
- vpandn %xmm9, %xmm10, %xmm10; \
- vpand LCQWORD_reg, %xmm8, %xmm8; \
- vpand LCQWORD_reg, %xmm10, %xmm10; \
- vpor reg1, %xmm8, reg1; \
- vpor reg2, %xmm10, reg2
-# else
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm7; \
- movdqa UCHIGH_reg, %xmm8; \
- movdqa reg2, %xmm9; \
- movdqa UCHIGH_reg, %xmm10; \
- pcmpgtb UCLOW_reg, %xmm7; \
- pcmpgtb reg1, %xmm8; \
- pcmpgtb UCLOW_reg, %xmm9; \
- pcmpgtb reg2, %xmm10; \
- pand %xmm8, %xmm7; \
- pand %xmm10, %xmm9; \
- pand LCQWORD_reg, %xmm7; \
- pand LCQWORD_reg, %xmm9; \
- por %xmm7, reg1; \
- por %xmm9, reg2
-# endif
- TOLOWER (%xmm1, %xmm2)
-#else
-# define TOLOWER(reg1, reg2)
-#endif
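
The TOLOWER macro above case-folds 16 bytes branchlessly: two signed byte compares against the belowupper (0x40) and topupper constants select the bytes in 'A'..'Z', and touppermask (0x20) is ORed into exactly those bytes. A hedged C sketch of the non-AVX path follows; tolower16 is an illustrative name.

/* Branchless ASCII tolower over 16 bytes, mirroring TOLOWER above: bytes
   strictly above 0x40 ('A'-1) and strictly below 0x5b ('Z'+1) get bit
   0x20 ORed in.  Illustrative helper, not glibc code.  */
#include <emmintrin.h>

static __m128i tolower16 (__m128i v)
{
  const __m128i below_upper = _mm_set1_epi8 (0x40);   /* 'A' - 1 */
  const __m128i top_upper   = _mm_set1_epi8 (0x5b);   /* 'Z' + 1 */
  const __m128i case_bit    = _mm_set1_epi8 (0x20);
  __m128i gt_low  = _mm_cmpgt_epi8 (v, below_upper);  /* v > 0x40 (signed) */
  __m128i lt_high = _mm_cmpgt_epi8 (top_upper, v);    /* v < 0x5b (signed) */
  __m128i is_upper = _mm_and_si128 (gt_low, lt_high);
  return _mm_or_si128 (v, _mm_and_si128 (is_upper, case_bit));
}

Bytes with the high bit set compare as negative, so non-ASCII input passes through unchanged, which is why the case-insensitive entry points above fall back to the C implementation for non-ASCII locales.
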
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
- jnz LABEL(less16bytes)/* If not, find different value or null char */
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)/* finish comparison */
-#endif
- add $16, %rsi /* prepare to search next 16 bytes */
- add $16, %rdi /* prepare to search next 16 bytes */
-
- /*
- * Determine source and destination string offsets from 16-byte
-	 * alignment.  Use the relative offset difference between the two to
- * determine which case below to use.
- */
- .p2align 4
-LABEL(crosscache):
-	and	$0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
-	and	$0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
- mov $0xffff, %edx /* for equivalent offset */
- xor %r8d, %r8d
- and $0xf, %ecx /* offset of rsi */
- and $0xf, %eax /* offset of rdi */
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
- cmp %eax, %ecx
- je LABEL(ashr_0) /* rsi and rdi relative offset same */
- ja LABEL(bigger)
- mov %edx, %r8d /* r8d is offset flag for exit tail */
- xchg %ecx, %eax
- xchg %rsi, %rdi
-LABEL(bigger):
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- lea 15(%rax), %r9
- sub %rcx, %r9
- lea LABEL(unaligned_table)(%rip), %r10
- movslq (%r10, %r9,4), %r9
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- lea (%r10, %r9), %r10
- jmp *%r10 /* jump to corresponding case */
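
The dispatch just above reduces to a little arithmetic: once both pointers are rounded down to 16 bytes (and the operands swapped so that the rsi-side offset in %rcx is the larger), 15 + rdi_off - rsi_off indexes the unaligned table, and the selected ashr_N case shifts by N = 16 - (rsi_off - rdi_off), matching the per-case headers below. A hedged C sketch of that computation; the function names are illustrative.

/* Alignment-dispatch arithmetic used above (sketch only).  Both offsets
   are in 0..15 and rsi_off > rdi_off at this point.  */
static unsigned int ashr_table_index (unsigned int rdi_off, unsigned int rsi_off)
{
  /* Mirrors: lea 15(%rax), %r9; sub %rcx, %r9.  Yields 0..14, which the
     per-case headers below map to ashr_1 .. ashr_15.  */
  return 15u + rdi_off - rsi_off;
}

static unsigned int palignr_shift (unsigned int rdi_off, unsigned int rsi_off)
{
  return 16u - (rsi_off - rdi_off);        /* the N in ashr_N / palignr $N */
}
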
-
-/*
- * The following cases will be handled by ashr_0
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(0~15) n(0~15) 15(15+ n-n) ashr_0
- */
- .p2align 4
-LABEL(ashr_0):
-
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
-#else
- movdqa (%rdi), %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
-#endif
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
- pmovmskb %xmm1, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- /*
-	 * edx must equal r9d if the remaining (16 - rcx) bytes are equal to
-	 * the remaining (16 - rax) bytes and no null char was seen.
- */
- jne LABEL(less32bytes) /* mismatch or null char */
- UPDATE_STRNCMP_COUNTER
- mov $16, %rcx
- mov $16, %r9
-
- /*
- * Now both strings are aligned at 16-byte boundary. Loop over strings
- * checking 32-bytes per iteration.
- */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
- .p2align 4
-LABEL(ashr_0_use):
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- jmp LABEL(ashr_0_use)
-
-
- .p2align 4
-LABEL(ashr_0_exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- lea -16(%rdx, %rcx), %rcx
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rax,4), %eax
- movl (%rcx,%rdx,4), %edx
-#endif
- sub %edx, %eax
- ret
-
-
-
-/*
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n -15 0(15 +(n-15) - n) ashr_1
- */
- .p2align 4
-LABEL(ashr_1):
- pslldq $15, D(%xmm2) /* shift first string to align with second */
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
- pmovmskb %xmm2, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- jnz LABEL(less32bytes) /* mismatch or null char seen */
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads*/
- mov $1, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 1(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
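
The %r10 setup above is a per-page byte counter for the shifted rdi stream: it starts at minus the number of bytes left in the current 4 KiB page and gains 16 per iteration, so it turns positive exactly when the loop is about to need bytes from the next page and must take the nibble path first. A small hedged C sketch of just that bookkeeping; the function names are illustrative.

/* Page-boundary bookkeeping used by the ashr_* loops above (sketch only). */
#include <stdint.h>
#include <stdbool.h>

/* Matches: lea shift(%rdi), %r10; and $0xfff, %r10; sub $0x1000, %r10.
   The result is minus the number of bytes left in the 4 KiB page that
   contains rdi + shift.  */
static long page_counter_init (uintptr_t rdi, unsigned int shift)
{
  return (long) ((rdi + shift) & 0xfff) - 0x1000;
}

/* Matches: add $16, %r10; jg L(nibble_ashr_N_use).  A positive counter
   means the shifted stream has run out of bytes in the current page, so
   the nibble path must confirm the string continues before reading on.  */
static bool step_needs_nibble (long *r10)
{
  *r10 += 16;
  return *r10 > 0;
}
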
- .p2align 4
-LABEL(loop_ashr_1_use):
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
-LABEL(nibble_ashr_1_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_1_use)
-
- .p2align 4
-LABEL(nibble_ashr_1_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $1, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $14, %ecx
- ja LABEL(nibble_ashr_1_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
- */
- .p2align 4
-LABEL(ashr_2):
- pslldq $14, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $2, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 2(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_2_use):
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
-LABEL(nibble_ashr_2_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_2_use)
-
- .p2align 4
-LABEL(nibble_ashr_2_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $2, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $13, %ecx
- ja LABEL(nibble_ashr_2_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
- */
- .p2align 4
-LABEL(ashr_3):
- pslldq $13, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $3, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 3(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
-LABEL(loop_ashr_3_use):
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
-LABEL(nibble_ashr_3_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_3_use)
-
- .p2align 4
-LABEL(nibble_ashr_3_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $3, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $12, %ecx
- ja LABEL(nibble_ashr_3_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
- */
- .p2align 4
-LABEL(ashr_4):
- pslldq $12, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $4, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 4(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_4_use):
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
-LABEL(nibble_ashr_4_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_4_use)
-
- .p2align 4
-LABEL(nibble_ashr_4_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $4, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $11, %ecx
- ja LABEL(nibble_ashr_4_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
- */
- .p2align 4
-LABEL(ashr_5):
- pslldq $11, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $5, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 5(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_5_use):
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
-LABEL(nibble_ashr_5_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_5_use)
-
- .p2align 4
-LABEL(nibble_ashr_5_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $5, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $10, %ecx
- ja LABEL(nibble_ashr_5_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
- */
- .p2align 4
-LABEL(ashr_6):
- pslldq $10, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $6, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 6(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_6_use):
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
-LABEL(nibble_ashr_6_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_6_use)
-
- .p2align 4
-LABEL(nibble_ashr_6_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $6, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $9, %ecx
- ja LABEL(nibble_ashr_6_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
- */
- .p2align 4
-LABEL(ashr_7):
- pslldq $9, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $7, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 7(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_7_use):
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
-LABEL(nibble_ashr_7_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_7_use)
-
- .p2align 4
-LABEL(nibble_ashr_7_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $7, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $8, %ecx
- ja LABEL(nibble_ashr_7_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
- */
- .p2align 4
-LABEL(ashr_8):
- pslldq $8, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $8, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 8(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_8_use):
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
-LABEL(nibble_ashr_8_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_8_use)
-
- .p2align 4
-LABEL(nibble_ashr_8_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $8, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $7, %ecx
- ja LABEL(nibble_ashr_8_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
- */
- .p2align 4
-LABEL(ashr_9):
- pslldq $7, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $9, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 9(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_9_use):
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
-LABEL(nibble_ashr_9_restart_use):
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_9_use)
-
- .p2align 4
-LABEL(nibble_ashr_9_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $9, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $6, %ecx
- ja LABEL(nibble_ashr_9_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
- */
- .p2align 4
-LABEL(ashr_10):
- pslldq $6, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $10, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 10(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_10_use):
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
-LABEL(nibble_ashr_10_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_10_use)
-
- .p2align 4
-LABEL(nibble_ashr_10_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $10, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $5, %ecx
- ja LABEL(nibble_ashr_10_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
- */
- .p2align 4
-LABEL(ashr_11):
- pslldq $5, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $11, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 11(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_11_use):
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
-LABEL(nibble_ashr_11_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_11_use)
-
- .p2align 4
-LABEL(nibble_ashr_11_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $11, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $4, %ecx
- ja LABEL(nibble_ashr_11_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
- */
- .p2align 4
-LABEL(ashr_12):
- pslldq $4, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $12, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 12(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_12_use):
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
-LABEL(nibble_ashr_12_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_12_use)
-
- .p2align 4
-LABEL(nibble_ashr_12_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $12, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $3, %ecx
- ja LABEL(nibble_ashr_12_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
- */
- .p2align 4
-LABEL(ashr_13):
- pslldq $3, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $13, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 13(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_13_use):
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
-LABEL(nibble_ashr_13_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_13_use)
-
- .p2align 4
-LABEL(nibble_ashr_13_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $13, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $2, %ecx
- ja LABEL(nibble_ashr_13_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
- */
- .p2align 4
-LABEL(ashr_14):
- pslldq $2, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $14, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 14(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_14_use):
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
-LABEL(nibble_ashr_14_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_14_use)
-
- .p2align 4
-LABEL(nibble_ashr_14_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $14, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $1, %ecx
- ja LABEL(nibble_ashr_14_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
- */
- .p2align 4
-LABEL(ashr_15):
- pslldq $1, D(%xmm2)
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
-
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $15, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 15(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
-
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_15_use):
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
-LABEL(nibble_ashr_15_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_15_use)
-
- .p2align 4
-LABEL(nibble_ashr_15_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $15, D(%xmm0)
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $0, %ecx
- ja LABEL(nibble_ashr_15_restart_use)
-
-LABEL(nibble_ashr_exit_use):
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- .p2align 4
-LABEL(exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add %rcx, %rdx
- lea -16(%rdi, %r9), %rdi
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- test %r8d, %r8d
- jz LABEL(ret_use)
- xchg %eax, %edx
-LABEL(ret_use):
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rdx,4), %edx
- movl (%rcx,%rax,4), %eax
-#endif
-
- sub %edx, %eax
- ret
-
-LABEL(less32bytes):
- lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
- lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
- test %r8d, %r8d
- jz LABEL(ret)
- xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
-
- .p2align 4
-LABEL(ret):
-LABEL(less16bytes):
- bsf %rdx, %rdx /* find and store bit index in %rdx */
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rdx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzbl (%rsi, %rdx), %ecx
- movzbl (%rdi, %rdx), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
-
-LABEL(strcmp_exitz):
- xor %eax, %eax
- ret
-
- .p2align 4
- // XXX Same as code above
-LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
- cfi_endproc
- .size STRCMP_SSE42, .-STRCMP_SSE42
-
-#undef UCLOW_reg
-#undef UCHIGH_reg
-#undef LCQWORD_reg
-#undef TOLOWER
-
- /* Put all SSE 4.2 functions together. */
- .section .rodata.SECTION,"a",@progbits
- .p2align 3
-LABEL(unaligned_table):
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
- .int LABEL(ashr_0) - LABEL(unaligned_table)
-
-#undef LABEL
-#undef GLABEL
-#undef SECTION
-#undef movdqa
-#undef movdqu
-#undef pmovmskb
-#undef pcmpistri
-#undef psubb
-#undef pcmpeqb
-#undef psrldq
-#undef pslldq
-#undef palignr
-#undef pxor
-#undef D
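
The removed strcmp-sse42.S above handles every misalignment case (ashr_1 through ashr_15) the same way: shift one source with pslldq/palignr so both strings can be read with aligned 16-byte loads, compare the vectors with pcmpistri (lowercasing via TOLOWER first in the case-insensitive variants), and keep a counter in %r10 that turns positive just before an aligned load would cross a 4 KiB page, at which point the nibble_ashr_N path checks the remainder of the current page before continuing. A minimal C sketch of that counter follows; the file and function names are assumptions for illustration, not glibc code.

    /* page_cross_sketch.c - illustrative only, not glibc source.  */
    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    #define PAGE_SIZE 0x1000UL   /* 4 KiB page, as the assembly assumes */

    /* Mirrors %r10: start at (page offset of s + shift) - PAGE_SIZE and
       advance by 16 before every aligned 16-byte load.  A positive result
       means the next load could touch the following, possibly unmapped,
       page, so the assembly branches to its nibble_ashr_N tail instead.  */
    long page_cross_counter(const char *s, size_t shift, size_t bytes_done)
    {
        long counter = (long)(((uintptr_t)(s + shift)) & (PAGE_SIZE - 1))
                       - (long)PAGE_SIZE;
        return counter + (long)bytes_done;
    }

    int main(void)
    {
        char buf[64] = "example";
        /* The sign depends on where buf sits within its page; a value > 0
           would mean "take the nibble path before the next 16-byte load".  */
        printf("%ld\n", page_cross_counter(buf, 8, 16));
        return 0;
    }
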
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
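
The deleted strcmp-ssse3.S is only a thin wrapper: it defines USE_SSSE3, renames the entry point to __strcmp_ssse3, and then includes the generic ../strcmp.S so the one shared body is assembled once per variant. A self-contained C analogue of that stamp-out-a-variant pattern is sketched below; it uses a macro instead of an #include purely for illustration, and the _demo names are assumptions.

    /* variant_sketch.c - illustrative only; the real wrappers #include a
       shared assembly body rather than expanding a macro.  */
    #define DEFINE_STRCMP_VARIANT(name)                             \
        int name(const char *a, const char *b)                      \
        {                                                           \
            while (*a && *a == *b) { ++a; ++b; }                    \
            return (unsigned char)*a - (unsigned char)*b;           \
        }

    DEFINE_STRCMP_VARIANT(strcmp_sse2_demo)    /* baseline build of the body */
    DEFINE_STRCMP_VARIANT(strcmp_ssse3_demo)   /* same body under another name */
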
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
deleted file mode 100644
index 54f8f7dd44..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Multiple versions of strcmp
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifdef USE_AS_STRNCMP
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-
-# define STRCMP_SSE42 __strncmp_sse42
-# define STRCMP_SSSE3 __strncmp_ssse3
-# define STRCMP_SSE2 __strncmp_sse2
-# define __GI_STRCMP __GI_strncmp
-#elif defined USE_AS_STRCASECMP_L
-# include "locale-defines.h"
-
-# define UPDATE_STRNCMP_COUNTER
-
-# define STRCMP_AVX __strcasecmp_l_avx
-# define STRCMP_SSE42 __strcasecmp_l_sse42
-# define STRCMP_SSSE3 __strcasecmp_l_ssse3
-# define STRCMP_SSE2 __strcasecmp_l_sse2
-# define __GI_STRCMP __GI___strcasecmp_l
-#elif defined USE_AS_STRNCASECMP_L
-# include "locale-defines.h"
-
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-
-# define STRCMP_AVX __strncasecmp_l_avx
-# define STRCMP_SSE42 __strncasecmp_l_sse42
-# define STRCMP_SSSE3 __strncasecmp_l_ssse3
-# define STRCMP_SSE2 __strncasecmp_l_sse2
-# define __GI_STRCMP __GI___strncasecmp_l
-#else
-# define USE_AS_STRCMP
-# define UPDATE_STRNCMP_COUNTER
-# ifndef STRCMP
-# define STRCMP strcmp
-# define STRCMP_SSE42 __strcmp_sse42
-# define STRCMP_SSSE3 __strcmp_ssse3
-# define STRCMP_SSE2 __strcmp_sse2
-# define __GI_STRCMP __GI_strcmp
-# endif
-#endif
-
-/* Define multiple versions only for the definition in libc. Don't
- define multiple versions for strncmp in static library since we
- need strncmp before the initialization happened. */
-#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
- .text
-ENTRY(STRCMP)
- .type STRCMP, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef USE_AS_STRCMP
- leaq __strcmp_sse2_unaligned(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 3f
-#else
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq STRCMP_SSE42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-#endif
-2: leaq STRCMP_SSSE3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq STRCMP_SSE2(%rip), %rax
-3: ret
-END(STRCMP)
-
-# ifdef USE_AS_STRCASECMP_L
-ENTRY(__strcasecmp)
- .type __strcasecmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strcasecmp_avx(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Usable)
- jnz 3f
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq __strcasecmp_sse42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-2: leaq __strcasecmp_ssse3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq __strcasecmp_sse2(%rip), %rax
-3: ret
-END(__strcasecmp)
-weak_alias (__strcasecmp, strcasecmp)
-# endif
-# ifdef USE_AS_STRNCASECMP_L
-ENTRY(__strncasecmp)
- .type __strncasecmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strncasecmp_avx(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Usable)
- jnz 3f
- HAS_ARCH_FEATURE (Slow_SSE4_2)
- jnz 2f
- leaq __strncasecmp_sse42(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jnz 3f
-2: leaq __strncasecmp_ssse3(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jnz 3f
- leaq __strncasecmp_sse2(%rip), %rax
-3: ret
-END(__strncasecmp)
-weak_alias (__strncasecmp, strncasecmp)
-# endif
-
-# undef LABEL
-# define LABEL(l) .L##l##_sse42
-# define GLABEL(l) l##_sse42
-# define SECTION sse4.2
-# include "strcmp-sse42.S"
-
-
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define LABEL(l) .L##l##_avx
-# define GLABEL(l) l##_avx
-# define USE_AVX 1
-# undef STRCMP_SSE42
-# define STRCMP_SSE42 STRCMP_AVX
-# define SECTION avx
-# include "strcmp-sse42.S"
-# endif
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCMP_SSE2, @function; \
- .align 16; \
- .globl STRCMP_SSE2; \
- .hidden STRCMP_SSE2; \
- STRCMP_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
-
-# ifdef USE_AS_STRCASECMP_L
-# define ENTRY2(name) \
- .type __strcasecmp_sse2, @function; \
- .align 16; \
- .globl __strcasecmp_sse2; \
- .hidden __strcasecmp_sse2; \
- __strcasecmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# define END2(name) \
- cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
-# endif
-
-# ifdef USE_AS_STRNCASECMP_L
-# define ENTRY2(name) \
- .type __strncasecmp_sse2, @function; \
- .align 16; \
- .globl __strncasecmp_sse2; \
- .hidden __strncasecmp_sse2; \
- __strncasecmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# define END2(name) \
- cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
-# endif
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
-#endif
-
-#include "../strcmp.S"
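
The deleted multiarch strcmp.S above is essentially an IFUNC selector: at relocation time it returns the address of the SSE4.2, SSSE3, AVX, or SSE2 variant based on the HAS_CPU_FEATURE / HAS_ARCH_FEATURE checks shown. A rough C analogue of that dispatch, using GCC's ifunc attribute and __builtin_cpu_supports, is sketched below; the _demo names and the single fallback body are assumptions, and the real selector additionally consults tunables such as Slow_SSE4_2 and Fast_Unaligned_Load.

    /* ifunc_sketch.c - illustrative only (GCC extensions, ELF target assumed).  */
    static int strcmp_scalar_demo(const char *a, const char *b)
    {
        while (*a && *a == *b) { ++a; ++b; }
        return (unsigned char)*a - (unsigned char)*b;
    }

    /* The resolver runs once, when the dynamic linker relocates strcmp_demo.  */
    static int (*resolve_strcmp_demo(void))(const char *, const char *)
    {
        __builtin_cpu_init();
        if (__builtin_cpu_supports("sse4.2"))
            return strcmp_scalar_demo;   /* stand-in for __strcmp_sse42 */
        if (__builtin_cpu_supports("ssse3"))
            return strcmp_scalar_demo;   /* stand-in for __strcmp_ssse3 */
        return strcmp_scalar_demo;       /* stand-in for __strcmp_sse2 */
    }

    int strcmp_demo(const char *a, const char *b)
        __attribute__((ifunc("resolve_strcmp_demo")));
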
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
deleted file mode 100644
index 6a5ab7ab26..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,1889 +0,0 @@
-/* strcpy with SSE2 and unaligned load
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_sse2_unaligned
-# endif
-
-# endif
-
-# define JMPTBL(I, B) I - B
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), %rcx; \
- lea (%r11, %rcx), %rcx; \
- jmp *%rcx
-
-# ifndef USE_AS_STRCAT
-
-.text
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
- test %r8, %r8
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
-
-# endif
-
- and $63, %rcx
- cmp $32, %rcx
- jbe L(SourceStringAlignmentLess32)
-
- and $-16, %rsi
- and $15, %rcx
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
-
- pcmpeqb (%rsi), %xmm1
- pmovmskb %xmm1, %rdx
- shr %cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $16, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $17, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyFrom1To16BytesTailCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail)
-
- pcmpeqb 16(%rsi), %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
- add $16, %r10
- cmp %r10, %r8
- jbe L(CopyFrom1To32BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes)
-
- movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
- movdqu %xmm1, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(Unalign16Both):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $16, %rcx
- movdqa (%rsi, %rcx), %xmm1
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $48, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm4
- movdqu %xmm3, (%rdi, %rcx)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm1
- movdqu %xmm4, (%rdi, %rcx)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm1)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movdqu %xmm3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea 16(%rsi, %rcx), %rsi
- and $-0x40, %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea 128(%r8, %rdx), %r8
-# endif
-L(Unaligned64Loop):
- movaps (%rsi), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rsi), %xmm5
- movaps 32(%rsi), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rsi), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(Unaligned64Leave)
-
-L(Unaligned64Loop_start):
- add $64, %rdi
- add $64, %rsi
- movdqu %xmm4, -64(%rdi)
- movaps (%rsi), %xmm2
- movdqa %xmm2, %xmm4
- movdqu %xmm5, -48(%rdi)
- movaps 16(%rsi), %xmm5
- pminub %xmm5, %xmm2
- movaps 32(%rsi), %xmm3
- movdqu %xmm6, -32(%rdi)
- movaps %xmm3, %xmm6
- movdqu %xmm7, -16(%rdi)
- movaps 48(%rsi), %xmm7
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jz L(Unaligned64Loop_start)
-
-L(Unaligned64Leave):
- pxor %xmm1, %xmm1
-
- pcmpeqb %xmm4, %xmm0
- pcmpeqb %xmm5, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_0)
- test %rcx, %rcx
- jnz L(CopyFrom1To16BytesUnaligned_16)
-
- pcmpeqb %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_32)
-
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
- movdqu %xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 48(%rdi, %rdx), %rax
-# endif
- movdqu %xmm7, 48(%rdi)
- add $15, %r8
- sub %rdx, %r8
- lea 49(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $48, %rsi
- add $48, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLess32):
- pxor %xmm0, %xmm0
- movdqu (%rsi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $16, %r8
-# else
- cmp $17, %r8
-# endif
- jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1)
-
- pcmpeqb %xmm2, %xmm0
- movdqu %xmm1, (%rdi)
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $32, %r8
-# else
- cmp $33, %r8
-# endif
- jbe L(CopyFrom1To32Bytes1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes1)
-
- and $-16, %rsi
- and $15, %rcx
- jmp L(Unalign16Both)
-
-/*------End of main part with loops---------------------*/
-
-/* Case1 */
-
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
- .p2align 4
-L(CopyFrom1To16BytesTail):
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1):
- add $16, %rsi
- add $16, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
-# endif
-L(CopyFrom1To16BytesTail1):
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes):
- bsf %rdx, %rdx
- add %rcx, %rsi
- add $16, %rdx
- sub %rcx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_0):
- bsf %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- movdqu %xmm4, (%rdi)
- add $63, %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_16):
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 16(%rdi, %rdx), %rax
-# endif
- movdqu %xmm5, 16(%rdi)
- add $47, %r8
- sub %rdx, %r8
- lea 17(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $16, %rsi
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_32):
- bsf %rdx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 32(%rdi, %rdx), %rax
-# endif
- movdqu %xmm6, 32(%rdi)
- add $31, %r8
- sub %rdx, %r8
- lea 33(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $32, %rsi
- add $32, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm6):
- movdqu %xmm6, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm5):
- movdqu %xmm5, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm4):
- movdqu %xmm4, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm3):
- movdqu %xmm3, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm1):
- movdqu %xmm1, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesExit):
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
-/* Case2 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- add $16, %rdx
- sub %rcx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTailCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTail1Case2):
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesCase2)
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To32BytesCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTailCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1Case2OrCase3):
- add $16, %rdi
- add $16, %rsi
- sub $16, %r8
-L(CopyFrom1To16BytesTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1Case2)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-# endif
-
-/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
-
- .p2align 4
-L(Exit1):
- mov %dh, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov %dh, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- mov (%rsi), %ecx
- mov %dh, 4(%rdi)
- mov %ecx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $5, %r8
- lea 5(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $6, %r8
- lea 6(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $7, %r8
- lea 7(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $8, %r8
- lea 8(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rsi), %rcx
- mov %dh, 8(%rdi)
- mov %rcx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $9, %r8
- lea 9(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $10, %r8
- lea 10(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $11, %r8
- lea 11(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $12, %r8
- lea 12(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $13, %r8
- lea 13(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $14, %r8
- lea 14(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $15, %r8
- lea 15(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
- lea 16(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit17):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
- mov %dh, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $17, %r8
- lea 17(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $18, %r8
- lea 18(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $19, %r8
- lea 19(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $20, %r8
- lea 20(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dh, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $21, %r8
- lea 21(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $22, %r8
- lea 22(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $23, %r8
- lea 23(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $24, %r8
- lea 24(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
- mov %dh, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $25, %r8
- lea 25(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $26, %r8
- lea 26(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $27, %r8
- lea 27(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $28, %r8
- lea 28(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $29, %r8
- lea 29(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $30, %r8
- lea 30(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $31, %r8
- lea 31(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $32, %r8
- lea 32(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, (%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit1):
- mov (%rsi), %dl
- mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 1(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 2(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit3):
- mov (%rsi), %cx
- mov 2(%rsi), %dl
- mov %cx, (%rdi)
- mov %dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 3(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 4(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit5):
- mov (%rsi), %ecx
- mov 4(%rsi), %dl
- mov %ecx, (%rdi)
- mov %dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 5(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 6(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 7(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 8(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit9):
- mov (%rsi), %rcx
- mov 8(%rsi), %dl
- mov %rcx, (%rdi)
- mov %dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 9(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 10(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 11(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 12(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 13(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 14(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 15(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 16(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit17):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 17(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 18(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 19(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 20(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- mov 20(%rsi), %dl
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 21(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 22(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 23(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 24(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 25(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 26(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 27(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 28(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 29(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 30(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 31(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 32(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 32(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit33):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- mov 32(%rsi), %cl
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
- mov %cl, 32(%rdi)
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 33(%rdi)
-# endif
- ret
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- ret
-
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(Fill3):
- mov %edx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill4):
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(Fill5):
- mov %edx, (%rdi)
- mov %dl, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill6):
- mov %edx, (%rdi)
- mov %dx, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill7):
- mov %rdx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rdi)
- mov %dl, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rdi)
- mov %dx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rdi)
- mov %edx, 7(%rdi)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rdi)
- mov %edx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rdi)
- mov %rdx, 5(%rdi)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rdi)
- mov %rdx, 6(%rdi)
- ret
-
- .p2align 4
-L(Fill15):
- movdqu %xmm0, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill16):
- movdqu %xmm0, (%rdi)
- ret
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm2):
- movdqu %xmm2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyFrom1To16BytesXmmExit):
- bsf %rdx, %rdx
- add $15, %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- pxor %xmm0, %xmm0
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit)
-
- movdqu %xmm0, (%rdi)
- add $16, %rdi
-
- mov %rdi, %rsi
- and $0xf, %rsi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- movdqa %xmm0, 32(%rdi)
- movdqa %xmm0, 48(%rdi)
- add $64, %rdi
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- add $32, %rdi
- sub $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillExit):
- add $16, %r8
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-/* end of ifndef USE_AS_STRCAT */
-# endif
-
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(Unaligned64LeaveCase2)
-L(Unaligned64LeaveCase3):
- lea 64(%r8), %rcx
- and $-16, %rcx
- add $48, %r8
- jl L(CopyFrom1To16BytesCase3)
- movdqu %xmm4, (%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm5, 16(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm6, 32(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
- lea 64(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 64(%rdi)
-# endif
- ret
-
- .p2align 4
-L(Unaligned64LeaveCase2):
- xor %rcx, %rcx
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm4, (%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm5)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm5, 16(%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm6)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm6, 32(%rdi)
- lea 16(%rdi, %rcx), %rdi
- lea 16(%rsi, %rcx), %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
- .p2align 4
- .section .rodata
-L(ExitTable):
- .int JMPTBL(L(Exit1), L(ExitTable))
- .int JMPTBL(L(Exit2), L(ExitTable))
- .int JMPTBL(L(Exit3), L(ExitTable))
- .int JMPTBL(L(Exit4), L(ExitTable))
- .int JMPTBL(L(Exit5), L(ExitTable))
- .int JMPTBL(L(Exit6), L(ExitTable))
- .int JMPTBL(L(Exit7), L(ExitTable))
- .int JMPTBL(L(Exit8), L(ExitTable))
- .int JMPTBL(L(Exit9), L(ExitTable))
- .int JMPTBL(L(Exit10), L(ExitTable))
- .int JMPTBL(L(Exit11), L(ExitTable))
- .int JMPTBL(L(Exit12), L(ExitTable))
- .int JMPTBL(L(Exit13), L(ExitTable))
- .int JMPTBL(L(Exit14), L(ExitTable))
- .int JMPTBL(L(Exit15), L(ExitTable))
- .int JMPTBL(L(Exit16), L(ExitTable))
- .int JMPTBL(L(Exit17), L(ExitTable))
- .int JMPTBL(L(Exit18), L(ExitTable))
- .int JMPTBL(L(Exit19), L(ExitTable))
- .int JMPTBL(L(Exit20), L(ExitTable))
- .int JMPTBL(L(Exit21), L(ExitTable))
- .int JMPTBL(L(Exit22), L(ExitTable))
- .int JMPTBL(L(Exit23), L(ExitTable))
- .int JMPTBL(L(Exit24), L(ExitTable))
- .int JMPTBL(L(Exit25), L(ExitTable))
- .int JMPTBL(L(Exit26), L(ExitTable))
- .int JMPTBL(L(Exit27), L(ExitTable))
- .int JMPTBL(L(Exit28), L(ExitTable))
- .int JMPTBL(L(Exit29), L(ExitTable))
- .int JMPTBL(L(Exit30), L(ExitTable))
- .int JMPTBL(L(Exit31), L(ExitTable))
- .int JMPTBL(L(Exit32), L(ExitTable))
-# ifdef USE_AS_STRNCPY
-L(ExitStrncpyTable):
- .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(FillTable):
- .int JMPTBL(L(Fill0), L(FillTable))
- .int JMPTBL(L(Fill1), L(FillTable))
- .int JMPTBL(L(Fill2), L(FillTable))
- .int JMPTBL(L(Fill3), L(FillTable))
- .int JMPTBL(L(Fill4), L(FillTable))
- .int JMPTBL(L(Fill5), L(FillTable))
- .int JMPTBL(L(Fill6), L(FillTable))
- .int JMPTBL(L(Fill7), L(FillTable))
- .int JMPTBL(L(Fill8), L(FillTable))
- .int JMPTBL(L(Fill9), L(FillTable))
- .int JMPTBL(L(Fill10), L(FillTable))
- .int JMPTBL(L(Fill11), L(FillTable))
- .int JMPTBL(L(Fill12), L(FillTable))
- .int JMPTBL(L(Fill13), L(FillTable))
- .int JMPTBL(L(Fill14), L(FillTable))
- .int JMPTBL(L(Fill15), L(FillTable))
- .int JMPTBL(L(Fill16), L(FillTable))
-# endif
-# endif
-#endif
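
Both deleted files dispatch small residual byte counts through tables of 32-bit self-relative offsets: each ".int JMPTBL(L(ExitN), L(ExitTable))" entry stores the distance from the table base to its handler, and BRANCH_TO_JMPTBL_ENTRY loads the entry for the count, adds the table address back, and jumps. The closest portable analogue is an indexed table of handlers (or a switch); a hedged sketch of the idea, with illustrative names that are not from the source:

#include <stddef.h>
#include <string.h>

/* Portable analogue of the ExitN / FillN jump tables: an array indexed
   by the remaining byte count selects a small fixed-size copy.  The
   real code stores 32-bit table-relative offsets instead of pointers
   and reconstructs the branch target at run time.  */
typedef void (*tail_copy_fn) (char *dst, const char *src);

static void copy0 (char *dst, const char *src) { (void) dst; (void) src; }
static void copy1 (char *dst, const char *src) { memcpy (dst, src, 1); }
static void copy2 (char *dst, const char *src) { memcpy (dst, src, 2); }
static void copy3 (char *dst, const char *src) { memcpy (dst, src, 3); }
static void copy4 (char *dst, const char *src) { memcpy (dst, src, 4); }

static const tail_copy_fn tail_copy[] =
  { copy0, copy1, copy2, copy3, copy4 /* ... one entry per tail size */ };

static void
copy_tail (char *dst, const char *src, size_t n)
{
  tail_copy[n] (dst, src);   /* single indexed, indirect branch */
}
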
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index 47aaeae671..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3551 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %r8, %r8
- jz L(Exit0)
- cmp $8, %r8
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
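
The pcmpeqb/pmovmskb pair above is the core null-byte scan: comparing a 16-byte block against a zeroed register yields 0xff in every lane holding '\0', and pmovmskb packs the lane sign bits into a 16-bit mask, so bsf (count-trailing-zeros) of the mask gives the offset of the terminator. A minimal intrinsics sketch of the same check; the helper name is illustrative:

#include <emmintrin.h>

/* Return the offset of the first '\0' in the 16 bytes at S16, or 16 if
   there is none - the same test the pcmpeqb/pmovmskb sequence performs.
   S16 is assumed to be readable for 16 bytes.  */
static int
null_offset_in_16 (const unsigned char *s16)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) s16);
  __m128i zero  = _mm_setzero_si128 ();
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
  if (mask == 0)
    return 16;                     /* no terminator in this block */
  return __builtin_ctz (mask);     /* bit position == byte offset (bsf) */
}
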
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
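
L(Aligned64Loop) checks 64 source bytes per iteration with a single test: pminub folds the four 16-byte vectors together, so the combined vector contains a zero byte exactly when one of the four did, and a single pcmpeqb/pmovmskb decides whether the whole 64-byte block can be stored as-is. Only when that test fires does L(Aligned64Leave) re-examine the vectors one at a time. A sketch of the detection step with intrinsics; the helper name is illustrative:

#include <emmintrin.h>

/* Return nonzero iff any of the 64 bytes at P (16-byte aligned) is
   '\0'.  Mirrors the pminub cascade in L(Aligned64Loop): the unsigned
   byte-wise minimum of all four vectors has a zero lane exactly when
   one of the inputs does.  */
static int
has_null_in_64 (const unsigned char *p)
{
  __m128i a = _mm_load_si128 ((const __m128i *) (p + 0));
  __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
  __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
  __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
}
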
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
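
The L(ShlN) paths handle the case where source and destination sit at different offsets within a 16-byte line: the source is read with aligned loads, and palignr $N stitches each pair of consecutive aligned vectors back together so the aligned store to the destination receives the correctly shifted bytes. Each ShlN variant hard-codes its shift because palignr only accepts an immediate operand. A sketch of one step, assuming SSSE3 intrinsics; the helper name is illustrative:

#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* One step of the Shl1-style copy: given the previous and the next
   16-byte aligned source vectors, produce the 16 destination bytes
   that start one byte into PREV.  The shift amount must be a compile
   time constant, which is why the assembly has one ShlN block per N.  */
static __m128i
shifted_chunk_shl1 (__m128i prev, __m128i next)
{
  /* Concatenate next:prev (32 bytes) and take bytes 1..16.  */
  return _mm_alignr_epi8 (next, prev, 1);
}
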
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
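
The "cmpb $1, (%rax); sbb $-1, %rax" pair in the combined STPCPY+STRNCPY exits is a branchless fix-up of the return value for stpncpy, which must point at the written '\0' when one fits but at dest + n when the copy was truncated. The compare sets the carry flag exactly when the byte at %rax is zero, and sbb $-1 then adds 1 minus carry, so the pointer advances past a non-zero byte and stays put on a '\0'. The same computation as a hedged C sketch, not the library's own code:

#include <stddef.h>

/* Branchless equivalent of "cmpb $1,(%rax); sbb $-1,%rax": END points
   at the last byte written.  Return a pointer to that byte if it is
   the terminating '\0', else one past it (i.e. dest + n).  */
static char *
adjust_stpncpy_return (char *end)
{
  /* carry = (*end == '\0');  result = end + 1 - carry.  */
  return end + (*end != '\0');
}
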
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
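
The L(Fill0)..L(Fill16) stubs and L(StrncpyFillTailWithZero1) above implement
strncpy's requirement that, once the source string ends, every remaining
destination byte up to the requested length is written as NUL.  A minimal
scalar sketch of that contract (illustrative code, not glibc's implementation):

  #include <stddef.h>

  /* Reference behaviour of the exit/fill paths above: copy at most n
     bytes of the string, then pad the remainder of dst with NULs.  */
  static char *
  strncpy_ref (char *dst, const char *src, size_t n)
  {
    size_t i = 0;

    /* Copy source bytes until n is exhausted or the NUL is reached.  */
    for (; i < n && src[i] != '\0'; ++i)
      dst[i] = src[i];

    /* Zero-fill the tail; L(StrncpyFillTailWithZero1) does this 16 or
       64 bytes at a time with pxor/movdqa once dst is realigned.  */
    for (; i < n; ++i)
      dst[i] = '\0';

    return dst;
  }
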
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
deleted file mode 100644
index 77819ddc50..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Multiple versions of strcpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
-# ifndef STRCPY
-# define STRCPY strcpy
-# endif
-#endif
-
-#ifdef USE_AS_STPCPY
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
-# define __GI_STRCPY __GI_stpncpy
-# define __GI___STRCPY __GI___stpncpy
-# else
-# define STRCPY_SSSE3 __stpcpy_ssse3
-# define STRCPY_SSE2 __stpcpy_sse2
-# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned
-# define __GI_STRCPY __GI_stpcpy
-# define __GI___STRCPY __GI___stpcpy
-# endif
-#else
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
-# define __GI_STRCPY __GI_strncpy
-# else
-# define STRCPY_SSSE3 __strcpy_ssse3
-# define STRCPY_SSE2 __strcpy_sse2
-# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned
-# define __GI_STRCPY __GI_strcpy
-# endif
-#endif
-
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(STRCPY)
- .type STRCPY, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 2f
- leaq STRCPY_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSSE3)
- jz 2f
- leaq STRCPY_SSSE3(%rip), %rax
-2: ret
-END(STRCPY)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCPY_SSE2, @function; \
- .align 16; \
- .globl STRCPY_SSE2; \
- .hidden STRCPY_SSE2; \
- STRCPY_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
- The speedup we get from using SSSE3 instructions is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
-#endif
-
-#ifndef USE_AS_STRNCPY
-#include "../strcpy.S"
-#endif
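
The deleted strcpy.S is an IFUNC dispatcher: the indirect-function resolver
runs once at relocation time and hands back the best implementation for the
running CPU.  A rough C analogue of the selection logic, assuming hypothetical
cpu_has_* predicates and the GCC ifunc attribute (a sketch, not glibc's
resolver):

  extern char *__strcpy_sse2 (char *, const char *);
  extern char *__strcpy_sse2_unaligned (char *, const char *);
  extern char *__strcpy_ssse3 (char *, const char *);

  /* Stand-ins for HAS_ARCH_FEATURE (Fast_Unaligned_Load) and
     HAS_CPU_FEATURE (SSSE3); not glibc interfaces.  */
  extern int cpu_has_fast_unaligned_load (void);
  extern int cpu_has_ssse3 (void);

  typedef char *(*strcpy_fn) (char *, const char *);

  strcpy_fn
  my_strcpy_resolver (void)
  {
    if (cpu_has_fast_unaligned_load ())
      return __strcpy_sse2_unaligned;
    if (cpu_has_ssse3 ())
      return __strcpy_ssse3;
    return __strcpy_sse2;
  }

  char *my_strcpy (char *, const char *)
       __attribute__ ((ifunc ("my_strcpy_resolver")));

The __GI_STRCPY aliases defined above serve a related purpose: libc-internal
callers bind directly to the SSE2 entry point, so they skip the PLT
indirection mentioned in the comment.
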
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
deleted file mode 100644
index 67991b5ca7..0000000000
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/* strcspn with SSE4.2 intrinsics
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
-
-/* We use 0x2:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ANY
- | _SIDD_POSITIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- A A A A A A A A A A A A A A A A
-
- to find out if the first 16byte data element has any byte A and
- the offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte A at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte A.
- 3. The first 16byte data element is valid and doesn't have the byte A.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
-
-   case  ECX  CFlag  ZFlag  SFlag
-    1      X     1     0/1     0
-    2     16     0      1      0
-    3     16     0      0      0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
-#ifndef STRCSPN_SSE2
-# define STRCSPN_SSE2 __strcspn_sse2
-# define STRCSPN_SSE42 __strcspn_sse42
-#endif
-
-#ifdef USE_AS_STRPBRK
-# define RETURN(val1, val2) return val1
-#else
-# define RETURN(val1, val2) return val2
-#endif
-
-extern
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
-#endif
-STRCSPN_SSE2 (const char *, const char *);
-
-
-#ifdef USE_AS_STRPBRK
-char *
-#else
-size_t
-#endif
-__attribute__ ((section (".text.sse4.2")))
-STRCSPN_SSE42 (const char *s, const char *a)
-{
- if (*a == 0)
- RETURN (NULL, strlen (s));
-
- const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
- if (offset != 0)
- {
- /* Load masks. */
- aligned = (const char *) ((size_t) a & -16L);
- __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return STRCSPN_SSE2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
- }
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return STRCSPN_SSE2 (s, a);
- }
- }
-
- offset = (int) ((size_t) s & 15);
- if (offset != 0)
- {
- /* Check partial string. */
- aligned = (const char *) ((size_t) s & -16L);
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
-
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x2);
- /* No need to check ZFlag since ZFlag is always 1. */
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- if (cflag)
- RETURN ((char *) (s + length), length);
- /* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
- RETURN (NULL, index);
- aligned += 16;
- }
- else
- aligned = s;
-
- while (1)
- {
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x2);
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- int zflag = _mm_cmpistrz (mask, value, 0x2);
- if (cflag)
- RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
- if (zflag)
- RETURN (NULL,
- /* Find where the NULL terminator is. */
- (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
- aligned += 16;
- }
-}
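
For reference, the pcmpistri immediates used above can be spelled with the
_SIDD_* macros from <nmmintrin.h>.  The MODE_* names below are ad-hoc, and the
checks merely confirm the decomposition (compile with -msse4.2):

  #include <nmmintrin.h>

  #define MODE_ANY_MATCH  (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY \
                           | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)
  #define MODE_FIND_NUL   (_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_EACH \
                           | _SIDD_MASKED_NEGATIVE_POLARITY)

  _Static_assert (MODE_ANY_MATCH == 0x02, "mode used to search the set");
  _Static_assert (MODE_FIND_NUL == 0x3a, "mode used to locate the NUL");
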
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
deleted file mode 100644
index d102c7e80b..0000000000
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Multiple versions of strcspn
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-#include <sysdep.h>
-#include <init-arch.h>
-
-#ifdef USE_AS_STRPBRK
-#define STRCSPN_SSE42 __strpbrk_sse42
-#define STRCSPN_SSE2 __strpbrk_sse2
-#define __GI_STRCSPN __GI_strpbrk
-#else
-#ifndef STRCSPN
-#define STRCSPN strcspn
-#define STRCSPN_SSE42 __strcspn_sse42
-#define STRCSPN_SSE2 __strcspn_sse2
-#define __GI_STRCSPN __GI_strcspn
-#endif
-#endif
-
-/* Define multiple versions only for the definition in libc. Don't
- define multiple versions of strpbrk in the static library, since we
- need strpbrk before initialization has happened. */
-#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
- .text
-ENTRY(STRCSPN)
- .type STRCSPN, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq STRCSPN_SSE2(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jz 2f
- leaq STRCSPN_SSE42(%rip), %rax
-2: ret
-END(STRCSPN)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type STRCSPN_SSE2, @function; \
- .globl STRCSPN_SSE2; \
- .align 16; \
- STRCSPN_SSE2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
-#endif
-
-#ifdef USE_AS_STRPBRK
-#include "../strpbrk.S"
-#else
-#include "../strcspn.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l.S b/sysdeps/x86_64/multiarch/strncase_l.S
deleted file mode 100644
index 9c0149788e..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l.S
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Multiple versions of strncasecmp and strncasecmp_l
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP __strncasecmp_l
-#define USE_AS_STRNCASECMP_L
-#include "strcmp.S"
-
-weak_alias (__strncasecmp_l, strncasecmp_l)
-libc_hidden_def (strncasecmp_l)
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
deleted file mode 100644
index a3cdbff689..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STRNCAT __strncat_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
- __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
-#endif
-
-#include "string/strncat.c"
diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
deleted file mode 100644
index 133e1d20b0..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_sse2_unaligned
-#include "strcat-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
deleted file mode 100644
index 5c1bf41453..0000000000
--- a/sysdeps/x86_64/multiarch/strncat.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncat
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCAT strncat
-#define USE_AS_STRNCAT
-#include "strcat.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index 96380a46be..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifdef SHARED
-# define USE_SSSE3 1
-# define STRCMP __strncmp_ssse3
-# define USE_AS_STRNCMP
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncmp.S b/sysdeps/x86_64/multiarch/strncmp.S
deleted file mode 100644
index fd5eb1397c..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncmp
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCMP strncmp
-#define USE_AS_STRNCMP
-#include "strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
deleted file mode 100644
index 296c32cb5d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-#define STRNCPY __strncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
-#endif
-
-#include "strncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
deleted file mode 100644
index fcc23a754a..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
deleted file mode 100644
index 6d87a0ba35..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strncpy
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY strncpy
-#define USE_AS_STRNCPY
-#include "strcpy.S"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c
deleted file mode 100644
index bbf5c49d89..0000000000
--- a/sysdeps/x86_64/multiarch/strpbrk-c.c
+++ /dev/null
@@ -1,8 +0,0 @@
-/* Don't define multiple versions of strpbrk in the static library, since we
-   need strpbrk before initialization has happened. */
-#ifdef SHARED
-# define USE_AS_STRPBRK
-# define STRCSPN_SSE2 __strpbrk_sse2
-# define STRCSPN_SSE42 __strpbrk_sse42
-# include "strcspn-c.c"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S
deleted file mode 100644
index 7201d6376f..0000000000
--- a/sysdeps/x86_64/multiarch/strpbrk.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* Multiple versions of strpbrk
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCSPN strpbrk
-#define USE_AS_STRPBRK
-#include "strcspn.S"
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
deleted file mode 100644
index 1704606b80..0000000000
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/* strspn with SSE4.2 intrinsics
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <nmmintrin.h>
-#include <string.h>
-#include "varshift.h"
-
-/* We use 0x12:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_ANY
- | _SIDD_NEGATIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- A A A A A A A A A A A A A A A A
-
- to find out if the first 16byte data element has any non-A byte and
- the offset of the first byte. There are 2 cases:
-
- 1. The first 16byte data element has the non-A byte, including
- EOS, at the offset X.
- 2. The first 16byte data element is valid and doesn't have the non-A
- byte.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 0 0
-
- We exit from the loop for case 1. */
-
-extern size_t __strspn_sse2 (const char *, const char *);
-
-
-size_t
-__attribute__ ((section (".text.sse4.2")))
-__strspn_sse42 (const char *s, const char *a)
-{
- if (*a == 0)
- return 0;
-
- const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
- if (offset != 0)
- {
- /* Load masks. */
- aligned = (const char *) ((size_t) a & -16L);
- __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
- }
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
-
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
- }
-
- offset = (int) ((size_t) s & 15);
- if (offset != 0)
- {
- /* Check partial string. */
- aligned = (const char *) ((size_t) s & -16L);
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
-
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
- /* No need to check CFlag since it is always 1. */
- if (length < 16 - offset)
- return length;
- /* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
- return length;
- aligned += 16;
- }
- else
- aligned = s;
-
- while (1)
- {
- __m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
- if (cflag)
- return (size_t) (aligned + index - s);
- aligned += 16;
- }
-}
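
The only difference from the strcspn variant above is the polarity bit:
immediate 0x12 asks pcmpistri for the first byte that is not in the accept
set.  Spelled with the <nmmintrin.h> macros (ad-hoc check, compile with
-msse4.2):

  #include <nmmintrin.h>

  _Static_assert ((_SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                   | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT)
                  == 0x12, "strspn search mode");
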
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
deleted file mode 100644
index adf7d9e533..0000000000
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Multiple versions of strspn
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(strspn)
- .type strspn, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strspn_sse2(%rip), %rax
- HAS_CPU_FEATURE (SSE4_2)
- jz 2f
- leaq __strspn_sse42(%rip), %rax
-2: ret
-END(strspn)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strspn_sse2, @function; \
- .globl __strspn_sse2; \
- .align 16; \
- __strspn_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
-#endif
-
-#include "../strspn.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
deleted file mode 100644
index 138979d10a..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
+++ /dev/null
@@ -1,374 +0,0 @@
-/* strstr with unaligned loads
- Copyright (C) 2009-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-ENTRY(__strstr_sse2_unaligned)
- movzbl (%rsi), %eax
- testb %al, %al
- je L(empty)
- movzbl 1(%rsi), %edx
- testb %dl, %dl
- je L(strchr)
- movd %eax, %xmm1
- movd %edx, %xmm2
- movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4031, %rax
- punpcklbw %xmm2, %xmm2
- punpcklwd %xmm1, %xmm1
- punpcklwd %xmm2, %xmm2
- pshufd $0, %xmm1, %xmm1
- pshufd $0, %xmm2, %xmm2
- ja L(cross_page)
- movdqu (%rdi), %xmm3
- pxor %xmm5, %xmm5
- movdqu 1(%rdi), %xmm4
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- movdqu 16(%rdi), %xmm0
- pcmpeqb %xmm5, %xmm6
- pminub %xmm4, %xmm3
- movdqa %xmm3, %xmm4
- movdqu 17(%rdi), %xmm3
- pcmpeqb %xmm0, %xmm5
- pcmpeqb %xmm2, %xmm3
- por %xmm6, %xmm4
- pcmpeqb %xmm1, %xmm0
- pminub %xmm3, %xmm0
- por %xmm5, %xmm0
- pmovmskb %xmm4, %r8d
- pmovmskb %xmm0, %eax
- salq $16, %rax
- orq %rax, %r8
- je L(next_32_bytes)
-L(next_pair_index):
- bsf %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero1)
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found1)
- cmpb 2(%rax), %dl
- jne L(next_pair)
- xorl %edx, %edx
- jmp L(pair_loop_start)
-
- .p2align 4
-L(strchr):
- movzbl %al, %esi
- jmp __strchr_sse2
-
- .p2align 4
-L(pair_loop):
- addq $1, %rdx
- cmpb 2(%rax,%rdx), %cl
- jne L(next_pair)
-L(pair_loop_start):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop)
-L(found1):
- ret
-L(zero1):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(next_pair):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index)
-
- .p2align 4
-L(next_32_bytes):
- movdqu 32(%rdi), %xmm3
- pxor %xmm5, %xmm5
- movdqu 33(%rdi), %xmm4
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- movdqu 48(%rdi), %xmm0
- pcmpeqb %xmm5, %xmm6
- pminub %xmm4, %xmm3
- movdqa %xmm3, %xmm4
- movdqu 49(%rdi), %xmm3
- pcmpeqb %xmm0, %xmm5
- pcmpeqb %xmm2, %xmm3
- por %xmm6, %xmm4
- pcmpeqb %xmm1, %xmm0
- pminub %xmm3, %xmm0
- por %xmm5, %xmm0
- pmovmskb %xmm4, %eax
- salq $32, %rax
- pmovmskb %xmm0, %r8d
- salq $48, %r8
- orq %rax, %r8
- je L(loop_header)
-L(next_pair2_index):
- bsfq %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero2)
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found2)
- cmpb 2(%rax), %dl
- jne L(next_pair2)
- xorl %edx, %edx
- jmp L(pair_loop2_start)
-
- .p2align 4
-L(pair_loop2):
- addq $1, %rdx
- cmpb 2(%rax,%rdx), %cl
- jne L(next_pair2)
-L(pair_loop2_start):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop2)
-L(found2):
- ret
-L(zero2):
- xorl %eax, %eax
- ret
-L(empty):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(next_pair2):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair2_index)
-L(loop_header):
- movq $-512, %r11
- movq %rdi, %r9
-
- pxor %xmm7, %xmm7
- andq $-64, %rdi
-
- .p2align 4
-L(loop):
- movdqa 64(%rdi), %xmm3
- movdqu 63(%rdi), %xmm6
- movdqa %xmm3, %xmm0
- pxor %xmm2, %xmm3
- pxor %xmm1, %xmm6
- movdqa 80(%rdi), %xmm10
- por %xmm3, %xmm6
- pminub %xmm10, %xmm0
- movdqu 79(%rdi), %xmm3
- pxor %xmm2, %xmm10
- pxor %xmm1, %xmm3
- movdqa 96(%rdi), %xmm9
- por %xmm10, %xmm3
- pminub %xmm9, %xmm0
- pxor %xmm2, %xmm9
- movdqa 112(%rdi), %xmm8
- addq $64, %rdi
- pminub %xmm6, %xmm3
- movdqu 31(%rdi), %xmm4
- pminub %xmm8, %xmm0
- pxor %xmm2, %xmm8
- pxor %xmm1, %xmm4
- por %xmm9, %xmm4
- pminub %xmm4, %xmm3
- movdqu 47(%rdi), %xmm5
- pxor %xmm1, %xmm5
- por %xmm8, %xmm5
- pminub %xmm5, %xmm3
- pminub %xmm3, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- testl %eax, %eax
- je L(loop)
- pminub (%rdi), %xmm6
- pminub 32(%rdi),%xmm4
- pminub 48(%rdi),%xmm5
- pcmpeqb %xmm7, %xmm6
- pcmpeqb %xmm7, %xmm5
- pmovmskb %xmm6, %edx
- movdqa 16(%rdi), %xmm8
- pcmpeqb %xmm7, %xmm4
- movdqu 15(%rdi), %xmm0
- pmovmskb %xmm5, %r8d
- movdqa %xmm8, %xmm3
- pmovmskb %xmm4, %ecx
- pcmpeqb %xmm1,%xmm0
- pcmpeqb %xmm2,%xmm3
- salq $32, %rcx
- pcmpeqb %xmm7,%xmm8
- salq $48, %r8
- pminub %xmm0,%xmm3
- orq %rcx, %rdx
- por %xmm3,%xmm8
- orq %rdx, %r8
- pmovmskb %xmm8, %eax
- salq $16, %rax
- orq %rax, %r8
- je L(loop)
-L(next_pair_index3):
- bsfq %r8, %rcx
- addq %rdi, %rcx
- cmpb $0, (%rcx)
- je L(zero)
- xorl %eax, %eax
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(success3)
- cmpb 1(%rcx), %dl
- jne L(next_pair3)
- jmp L(pair_loop_start3)
-
- .p2align 4
-L(pair_loop3):
- addq $1, %rax
- cmpb 1(%rcx,%rax), %dl
- jne L(next_pair3)
-L(pair_loop_start3):
- movzbl 3(%rsi,%rax), %edx
- testb %dl, %dl
- jne L(pair_loop3)
-L(success3):
- lea -1(%rcx), %rax
- ret
-
- .p2align 4
-L(next_pair3):
- addq %rax, %r11
- movq %rdi, %rax
- subq %r9, %rax
- cmpq %r11, %rax
- jl L(switch_strstr)
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index3)
- jmp L(loop)
-
- .p2align 4
-L(switch_strstr):
- movq %rdi, %rdi
- jmp __strstr_sse2
-
- .p2align 4
-L(cross_page):
-
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqa (%rax), %xmm3
- movdqu -1(%rax), %xmm4
- movdqa %xmm3, %xmm8
- movdqa 16(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm8
- pcmpeqb %xmm2, %xmm3
- movdqa %xmm5, %xmm7
- pminub %xmm4, %xmm3
- movdqu 15(%rax), %xmm4
- pcmpeqb %xmm0, %xmm7
- por %xmm3, %xmm8
- movdqa %xmm5, %xmm3
- movdqa 32(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm3
- movdqa %xmm5, %xmm6
- pmovmskb %xmm8, %ecx
- pminub %xmm4, %xmm3
- movdqu 31(%rax), %xmm4
- por %xmm3, %xmm7
- movdqa %xmm5, %xmm3
- pcmpeqb %xmm0, %xmm6
- movdqa 48(%rax), %xmm5
- pcmpeqb %xmm1, %xmm4
- pmovmskb %xmm7, %r8d
- pcmpeqb %xmm2, %xmm3
- pcmpeqb %xmm5, %xmm0
- pminub %xmm4, %xmm3
- movdqu 47(%rax), %xmm4
- por %xmm3, %xmm6
- movdqa %xmm5, %xmm3
- salq $16, %r8
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm6, %r10d
- pminub %xmm4, %xmm3
- por %xmm3, %xmm0
- salq $32, %r10
- orq %r10, %r8
- orq %rcx, %r8
- movl %edi, %ecx
- pmovmskb %xmm0, %edx
- subl %eax, %ecx
- salq $48, %rdx
- orq %rdx, %r8
- shrq %cl, %r8
- je L(loop_header)
-L(next_pair_index4):
- bsfq %r8, %rax
- addq %rdi, %rax
- cmpb $0, (%rax)
- je L(zero)
-
- cmpq %rax,%rdi
- je L(next_pair4)
-
- movzbl 2(%rsi), %edx
- testb %dl, %dl
- je L(found3)
- cmpb 1(%rax), %dl
- jne L(next_pair4)
- xorl %edx, %edx
- jmp L(pair_loop_start4)
-
- .p2align 4
-L(pair_loop4):
- addq $1, %rdx
- cmpb 1(%rax,%rdx), %cl
- jne L(next_pair4)
-L(pair_loop_start4):
- movzbl 3(%rsi,%rdx), %ecx
- testb %cl, %cl
- jne L(pair_loop4)
-L(found3):
- subq $1, %rax
- ret
-
- .p2align 4
-L(next_pair4):
- leaq -1(%r8), %rax
- andq %rax, %r8
- jne L(next_pair_index4)
- jmp L(loop_header)
-
- .p2align 4
-L(found):
- rep
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
-
-END(__strstr_sse2_unaligned)
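
__strstr_sse2_unaligned filters candidate positions by matching the needle's
first two bytes across 16 (and later 64) haystack positions at once with
pcmpeqb/pminub, verifies each candidate byte by byte, and appears to fall back
to __strstr_sse2 once the scalar verification exceeds the fixed budget kept in
%r11.  A scalar outline of that filter-then-verify structure, with
illustrative names:

  #include <stddef.h>
  #include <string.h>

  /* Scalar sketch of the strategy above; the assembly performs the
     two-byte filter on 16 or 64 positions per iteration.  */
  static const char *
  strstr_two_byte_filter (const char *hay, const char *needle)
  {
    if (needle[0] == '\0')
      return hay;                        /* L(empty) */
    if (needle[1] == '\0')
      return strchr (hay, needle[0]);    /* L(strchr) shortcut */

    for (; *hay != '\0'; ++hay)
      if (hay[0] == needle[0] && hay[1] == needle[1])
        {
          /* Candidate position: verify the rest of the needle.  */
          size_t i = 2;
          while (needle[i] != '\0' && hay[i] == needle[i])
            ++i;
          if (needle[i] == '\0')
            return hay;
        }
    return NULL;
  }
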
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
deleted file mode 100644
index a7d181d797..0000000000
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Multiple versions of strstr.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Redefine strstr so that the compiler won't complain about the type
- mismatch with the IFUNC selector in strong_alias, below. */
-#undef strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- ? __strstr_sse2_unaligned
- : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/test-multiarch.c b/sysdeps/x86_64/multiarch/test-multiarch.c
deleted file mode 100644
index 597d64e1e8..0000000000
--- a/sysdeps/x86_64/multiarch/test-multiarch.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Test CPU feature data.
- This file is part of the GNU C Library.
- Copyright (C) 2012-2017 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <cpu-features.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-static char *cpu_flags;
-
-/* Search for flags in /proc/cpuinfo and store line
- in cpu_flags. */
-void
-get_cpuinfo (void)
-{
- FILE *f;
- char *line = NULL;
- size_t len = 0;
- ssize_t read;
-
- f = fopen ("/proc/cpuinfo", "r");
- if (f == NULL)
- {
- printf ("cannot open /proc/cpuinfo\n");
- exit (1);
- }
-
- while ((read = getline (&line, &len, f)) != -1)
- {
- if (strncmp (line, "flags", 5) == 0)
- {
- cpu_flags = strdup (line);
- break;
- }
- }
- fclose (f);
- free (line);
-}
-
-int
-check_proc (const char *proc_name, int flag, const char *name)
-{
- int found = 0;
-
- printf ("Checking %s:\n", name);
- printf (" init-arch %d\n", flag);
- if (strstr (cpu_flags, proc_name) != NULL)
- found = 1;
- printf (" cpuinfo (%s) %d\n", proc_name, found);
-
- if (found != flag)
- printf (" *** failure ***\n");
-
- return (found != flag);
-}
-
-static int
-do_test (int argc, char **argv)
-{
- int fails;
-
- get_cpuinfo ();
- fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable),
- "HAS_ARCH_FEATURE (AVX_Usable)");
- fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable),
- "HAS_ARCH_FEATURE (FMA4_Usable)");
- fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2),
- "HAS_CPU_FEATURE (SSE4_2)");
- fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1)
- , "HAS_CPU_FEATURE (SSE4_1)");
- fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3),
- "HAS_CPU_FEATURE (SSSE3)");
- fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT),
- "HAS_CPU_FEATURE (POPCOUNT)");
-
- printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails);
-
- return (fails != 0);
-}
-
-#include "../../../test-skeleton.c"
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
deleted file mode 100644
index 1c3e34845d..0000000000
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include "varshift.h"
-
-const int8_t ___m128i_shift_right[31] attribute_hidden =
- {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- };
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
deleted file mode 100644
index 07bb76c4bf..0000000000
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Helper for variable shifts of SSE registers.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <stdint.h>
-#include <tmmintrin.h>
-
-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
-
-static __inline__ __m128i
-__m128i_shift_right (__m128i value, unsigned long int offset)
-{
- return _mm_shuffle_epi8 (value,
- _mm_loadu_si128 ((__m128i *) (___m128i_shift_right
- + offset)));
-}
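
varshift implements a variable byte-wise right shift of an XMM register:
pshufb is fed a 16-byte window that starts `offset' bytes into the 31-entry
index table, and because the trailing -1 entries have their high bit set,
pshufb writes zero into the vacated top bytes.  A self-contained sketch of the
same mechanism (local copies of the table and helper, not the glibc symbols;
compile with -mssse3):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <tmmintrin.h>

  static const int8_t shift_right_table[31] =
    {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

  /* Shift VALUE right by OFFSET bytes (0 <= OFFSET <= 15), zero-filling
     the top, as __m128i_shift_right above does.  */
  static __m128i
  shift_right_bytes (__m128i value, unsigned long offset)
  {
    return _mm_shuffle_epi8 (value,
                             _mm_loadu_si128 ((const __m128i *)
                                              (shift_right_table + offset)));
  }

  int
  main (void)
  {
    const char buf[17] = "ABCDEFGHIJKLMNOP";
    __m128i v = _mm_loadu_si128 ((const __m128i *) buf);
    __m128i s = shift_right_bytes (v, 3);

    char out[17] = { 0 };
    memcpy (out, &s, 16);
    printf ("%s\n", out);   /* prints "DEFGHIJKLMNOP"; top 3 bytes are 0.  */
    return 0;
  }
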
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
deleted file mode 100644
index a51a83a9be..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy-c.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define wcscpy __wcscpy_sse2
-#endif
-
-#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
deleted file mode 100644
index 53857ce4f5..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ /dev/null
@@ -1,552 +0,0 @@
-/* wcscpy with SSSE3
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-# include <sysdep.h>
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (__wcscpy_ssse3)
-
- mov %rsi, %rcx
- mov %rdi, %rdx
-
- cmpl $0, (%rcx)
- jz L(Exit4)
- cmpl $0, 4(%rcx)
- jz L(Exit8)
- cmpl $0, 8(%rcx)
- jz L(Exit12)
- cmpl $0, 12(%rcx)
- jz L(Exit16)
-
- lea 16(%rcx), %rsi
- and $-16, %rsi
-
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
-
- pcmpeqd (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $4, %rax
- je L(Shl4)
- cmp $8, %rax
- je L(Shl8)
- jmp L(Shl12)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqd %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
- pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqd %xmm5, %xmm0
-
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqd %xmm6, %xmm0
-
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqd %xmm7, %xmm0
-
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov $-0x40, %rsi
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -4(%rcx), %xmm1
-
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -8(%rcx), %xmm1
-
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-
- movaps -12(%rcx), %xmm1
-
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit4)
-
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit12)
-
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
- mov %rdi, %rax
- ret
-
-END(__wcscpy_ssse3)
-#endif
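The L(Aligned64Loop) in the file removed above scans 64 bytes per iteration for the terminating null wide character by folding the four 16-byte blocks with a byte-wise minimum (pminub) and then comparing the folded vector against zero dword-wise (pcmpeqd). A rough, hypothetical C/SSE2-intrinsics sketch of that check follows; the helper name and signature are illustrative only and are not part of this commit.

#include <emmintrin.h>   /* SSE2: _mm_min_epu8, _mm_cmpeq_epi32, _mm_movemask_epi8 */
#include <wchar.h>

/* Returns nonzero when the byte-wise minimum of four consecutive
   16-byte blocks contains an all-zero dword.  A real L'\0' always
   triggers it; a false positive (zero bytes from different blocks
   lining up in one dword slot) is harmless because the exit path in
   the assembly re-checks each block individually.  */
static int
block64_may_contain_null (const wchar_t *p)   /* p assumed 64-byte aligned */
{
  const __m128i *v = (const __m128i *) p;
  __m128i b0 = _mm_load_si128 (v + 0);
  __m128i b1 = _mm_load_si128 (v + 1);
  __m128i b2 = _mm_load_si128 (v + 2);
  __m128i b3 = _mm_load_si128 (v + 3);
  __m128i folded = _mm_min_epu8 (_mm_min_epu8 (b0, b1),          /* pminub */
                                 _mm_min_epu8 (b2, b3));
  __m128i eq = _mm_cmpeq_epi32 (folded, _mm_setzero_si128 ());   /* pcmpeqd */
  return _mm_movemask_epi8 (eq) != 0;                            /* pmovmskb */
}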
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
deleted file mode 100644
index 9150ab6d18..0000000000
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Multiple versions of wcscpy
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-
- .text
-ENTRY(wcscpy)
- .type wcscpy, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __wcscpy_sse2(%rip), %rax
- ret
-
-2: leaq __wcscpy_ssse3(%rip), %rax
- ret
-
-END(wcscpy)
-#endif
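The resolver removed above is the assembly form of a GNU indirect function: the symbol is typed @gnu_indirect_function, the body runs while the dynamic linker resolves the symbol, and it returns the address of the chosen implementation in %rax. As a minimal, generic C illustration of the same mechanism using GCC's ifunc attribute (made-up names, not glibc code; glibc's own C-style selectors look like the wcsnlen.c file further down in this patch):

static int impl_generic (int x) { return x + 1; }
static int impl_ssse3 (int x) { return x + 2; }

/* Runs once, when the dynamic linker resolves 'foo'; it must return
   the address of the implementation to use, just as the assembly
   above returns __wcscpy_sse2 or __wcscpy_ssse3 in %rax.  */
static __typeof (impl_generic) *
resolve_foo (void)
{
  __builtin_cpu_init ();   /* ifunc resolvers run before constructors */
  return __builtin_cpu_supports ("ssse3") ? impl_ssse3 : impl_generic;
}

int foo (int) __attribute__ ((ifunc ("resolve_foo")));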
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c
deleted file mode 100644
index e1ec7cfbb5..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WCSNLEN __wcsnlen_sse2
-
-extern __typeof (wcsnlen) __wcsnlen_sse2;
-#endif
-
-#include "wcsmbs/wcsnlen.c"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
deleted file mode 100644
index a8cab0cb00..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#define AS_WCSLEN
-#define AS_STRNLEN
-#define strlen __wcsnlen_sse4_1
-
-#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
deleted file mode 100644
index 304f62eec3..0000000000
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Multiple versions of wcsnlen.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-# define __wcsnlen __redirect_wcsnlen
-# include <wchar.h>
-# undef __wcsnlen
-
-# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
-
-libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
-weak_alias (__wcsnlen, wcsnlen);
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
deleted file mode 100644
index bfa1a16a35..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define MEMCMP __wmemcmp_avx2_movbe
-#define USE_AS_WMEMCMP 1
-
-#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
deleted file mode 100644
index 46b6715e18..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
deleted file mode 100644
index b07973a4f6..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_sse4_1
-
-#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
deleted file mode 100644
index 94b25a214c..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Multiple versions of wmemcmp
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
- .text
-ENTRY(wmemcmp)
- .type wmemcmp, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jnz 1f
- HAS_ARCH_FEATURE (AVX2_Usable)
- jz 1f
- HAS_CPU_FEATURE (MOVBE)
- jz 1f
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 1f
- leaq __wmemcmp_avx2_movbe(%rip), %rax
- ret
-
-1: HAS_CPU_FEATURE (SSSE3)
- jnz 2f
- leaq __wmemcmp_sse2(%rip), %rax
- ret
-
-2: HAS_CPU_FEATURE (SSE4_1)
- jz 3f
- leaq __wmemcmp_sse4_1(%rip), %rax
- ret
-
-3: leaq __wmemcmp_ssse3(%rip), %rax
- ret
-
-END(wmemcmp)
-#endif
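The dispatch order encoded in the wmemcmp ifunc assembly above, rewritten in the C selector style that wcsnlen.c (earlier in this patch) uses, might look roughly like the sketch below. The CPU_FEATURES_ARCH_P spellings for the HAS_ARCH_FEATURE checks are an assumption on my part; this commit contains no such file.

#if IS_IN (libc)
# define wmemcmp __redirect_wmemcmp
# include <wchar.h>
# undef wmemcmp

# define SYMBOL_NAME wmemcmp
# include <init-arch.h>

extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;

static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  /* AVX2 path only when VZEROUPPER is acceptable, MOVBE is present
     and unaligned AVX loads are fast -- mirroring the four checks in
     the assembly above.  */
  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
      && CPU_FEATURES_CPU_P (cpu_features, MOVBE)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    return OPTIMIZE (avx2_movbe);

  /* Without SSSE3 the assembly falls back straight to SSE2, so the
     SSE4.1 check is nested, not independent.  */
  if (CPU_FEATURES_CPU_P (cpu_features, SSSE3))
    {
      if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
	return OPTIMIZE (sse4_1);
      return OPTIMIZE (ssse3);
    }

  return OPTIMIZE (sse2);
}

libc_ifunc_redirected (__redirect_wmemcmp, wmemcmp, IFUNC_SELECTOR ());
#endif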
diff --git a/sysdeps/x86_64/multiarch/wmemset.c b/sysdeps/x86_64/multiarch/wmemset.c
deleted file mode 100644
index dd35be6e49..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Multiple versions of wmemset.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-# define wmemset __redirect_wmemset
-# define __wmemset __redirect___wmemset
-# include <wchar.h>
-# undef wmemset
-# undef __wmemset
-
-# define SYMBOL_NAME wmemset
-# include "ifunc-wmemset.h"
-
-libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ());
-weak_alias (__wmemset, wmemset)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
deleted file mode 100644
index 0a537fe272..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Non-shared version of wmemset_chk for x86-64.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc) && !defined SHARED
-# include "../wmemset_chk.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.c b/sysdeps/x86_64/multiarch/wmemset_chk.c
deleted file mode 100644
index d3ded5595b..0000000000
--- a/sysdeps/x86_64/multiarch/wmemset_chk.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of wmemset_chk.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Define multiple versions only for the definition in libc.so. */
-#if IS_IN (libc) && defined SHARED
-# define __wmemset_chk __redirect_wmemset_chk
-# include <wchar.h>
-# undef __wmemset_chk
-
-# define SYMBOL_NAME wmemset_chk
-# include "ifunc-wmemset.h"
-
-libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk,
- IFUNC_SELECTOR ());
-#endif