aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Raoni Fassina Firmino <raoni@linux.ibm.com> 2022-02-25 10:45:14 -0300
committer Raoni Fassina Firmino <raoni@linux.ibm.com> 2022-02-25 10:45:14 -0300
commit 821f0a9cb24486842f16f9b0d81d9c28e876c6ae (patch)
tree 690876c3a706cb222b159e2ebff02d218d2fd1fb
parent 889122cbfacedab8fdb62a9d3fd6244d528ea99c (diff)
parent 178292fe011d5745aa0dd7ea99b388c30512fbdb (diff)
download glibc-821f0a9cb24486842f16f9b0d81d9c28e876c6ae.tar
glibc-821f0a9cb24486842f16f9b0d81d9c28e876c6ae.tar.gz
glibc-821f0a9cb24486842f16f9b0d81d9c28e876c6ae.tar.bz2
glibc-821f0a9cb24486842f16f9b0d81d9c28e876c6ae.zip
Merge branch release/2.30/master into ibm/2.30/master
-rw-r--r-- NEWS 2
-rw-r--r-- elf/Makefile 2
-rw-r--r-- elf/dl-tunables.c 56
-rw-r--r-- elf/tst-env-setuid-tunables.c 118
-rw-r--r-- elf/tst-env-setuid.c 197
-rw-r--r-- stdlib/tst-secure-getenv.c 199
-rw-r--r-- string/test-strnlen.c 34
-rw-r--r-- support/capture_subprocess.h 6
-rw-r--r-- support/subprocess.h 5
-rw-r--r-- support/support_capture_subprocess.c 128
-rw-r--r-- support/support_subprocess.c 21
-rw-r--r-- sysdeps/s390/configure 8
-rw-r--r-- sysdeps/s390/configure.ac 8
-rw-r--r-- sysdeps/s390/memmove.c 2
-rw-r--r-- sysdeps/s390/multiarch/ifunc-impl-list.c 3
-rw-r--r-- sysdeps/x86/Makefile 27
-rw-r--r-- sysdeps/x86/cpu-features.c 20
-rw-r--r-- sysdeps/x86/cpu-features.h 2
-rw-r--r-- sysdeps/x86/cpu-tunables.c 3
-rw-r--r-- sysdeps/x86/tst-memchr-rtm.c 54
-rw-r--r-- sysdeps/x86/tst-memcmp-rtm.c 52
-rw-r--r-- sysdeps/x86/tst-memmove-rtm.c 53
-rw-r--r-- sysdeps/x86/tst-memrchr-rtm.c 54
-rw-r--r-- sysdeps/x86/tst-memset-rtm.c 45
-rw-r--r-- sysdeps/x86/tst-strchr-rtm.c 54
-rw-r--r-- sysdeps/x86/tst-strcpy-rtm.c 53
-rw-r--r-- sysdeps/x86/tst-string-rtm.h 72
-rw-r--r-- sysdeps/x86/tst-strlen-rtm.c 53
-rw-r--r-- sysdeps/x86/tst-strncmp-rtm.c 52
-rw-r--r-- sysdeps/x86/tst-strrchr-rtm.c 53
-rw-r--r-- sysdeps/x86_64/memchr.S 77
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 60
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-avx2.h 18
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-impl-list.c 402
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-memcmp.h 17
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-memmove.h 45
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-memset.h 49
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-strcpy.h 17
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-wcslen.h 52
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-wmemset.h 22
-rw-r--r-- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/memchr-avx2.S 494
-rw-r--r-- sysdeps/x86_64/multiarch/memchr-evex.S 478
-rw-r--r-- sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S 28
-rw-r--r-- sysdeps/x86_64/multiarch/memcmp-evex-movbe.S 440
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S 17
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S 18
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S 26
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S 57
-rw-r--r-- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/memrchr-avx2.S 53
-rw-r--r-- sysdeps/x86_64/multiarch/memrchr-evex.S 337
-rw-r--r-- sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S 10
-rw-r--r-- sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S 16
-rw-r--r-- sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S 24
-rw-r--r-- sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S 61
-rw-r--r-- sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/rawmemchr-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/stpcpy-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/stpncpy-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strcat-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strcat-avx2.S 6
-rw-r--r-- sysdeps/x86_64/multiarch/strcat-evex.S 283
-rw-r--r-- sysdeps/x86_64/multiarch/strchr-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strchr-avx2.S 22
-rw-r--r-- sysdeps/x86_64/multiarch/strchr-evex.S 335
-rw-r--r-- sysdeps/x86_64/multiarch/strchr.c 18
-rw-r--r-- sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strchrnul-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strcmp-avx2.S 65
-rw-r--r-- sysdeps/x86_64/multiarch/strcmp-evex.S 1043
-rw-r--r-- sysdeps/x86_64/multiarch/strcmp.c 19
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-avx2.S 85
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-evex.S 1003
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-avx2.S 565
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-evex.S 436
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-sse2.S 2
-rw-r--r-- sysdeps/x86_64/multiarch/strlen-vec.S 257
-rw-r--r-- sysdeps/x86_64/multiarch/strncat-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncat-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncmp-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncmp.c 19
-rw-r--r-- sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strncpy-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strnlen-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S 12
-rw-r--r-- sysdeps/x86_64/multiarch/strrchr-avx2.S 19
-rw-r--r-- sysdeps/x86_64/multiarch/strrchr-evex.S 265
-rw-r--r-- sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/wcschr-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wcscmp-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wcslen-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wcslen-sse4_1.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wcslen.c 2
-rw-r--r-- sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S 5
-rw-r--r-- sysdeps/x86_64/multiarch/wcsncmp-evex.S 5
-rw-r--r-- sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S 5
-rw-r--r-- sysdeps/x86_64/multiarch/wcsnlen-evex.S 5
-rw-r--r-- sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S 2
-rw-r--r-- sysdeps/x86_64/multiarch/wcsnlen.c 22
-rw-r--r-- sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/wcsrchr-evex.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wmemchr-evex.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S 4
-rw-r--r-- sysdeps/x86_64/strlen.S 243
-rw-r--r-- sysdeps/x86_64/sysdep.h 22
119 files changed, 7797 insertions, 1432 deletions
diff --git a/NEWS b/NEWS
index 66efd8e32b..f7501ac415 100644
--- a/NEWS
+++ b/NEWS
@@ -73,6 +73,8 @@ The following bugs are resolved with this release:
[25976] nss_compat: internal_end*ent may clobber errno, hiding ERANGE
[27130] "rep movsb" performance issue
[27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work
+ [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts
+ [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex
Version 2.30
diff --git a/elf/Makefile b/elf/Makefile
index 63416b2ffe..b0d0f60edb 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -1514,8 +1514,6 @@ $(objpfx)tst-nodelete-dlclose.out: $(objpfx)tst-nodelete-dlclose-dso.so \
tst-env-setuid-ENV = MALLOC_CHECK_=2 MALLOC_MMAP_THRESHOLD_=4096 \
LD_HWCAP_MASK=0x1
-tst-env-setuid-tunables-ENV = \
- GLIBC_TUNABLES=glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096
$(objpfx)tst-debug1: $(libdl)
$(objpfx)tst-debug1.out: $(objpfx)tst-debug1mod1.so
diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
index b0980c5ad9..a8425ddc9d 100644
--- a/elf/dl-tunables.c
+++ b/elf/dl-tunables.c
@@ -178,6 +178,7 @@ parse_tunables (char *tunestr, char *valstring)
return;
char *p = tunestr;
+ size_t off = 0;
while (true)
{
@@ -191,7 +192,11 @@ parse_tunables (char *tunestr, char *valstring)
/* If we reach the end of the string before getting a valid name-value
pair, bail out. */
if (p[len] == '\0')
- return;
+ {
+ if (__libc_enable_secure)
+ tunestr[off] = '\0';
+ return;
+ }
/* We did not find a valid name-value pair before encountering the
colon. */
@@ -217,35 +222,28 @@ parse_tunables (char *tunestr, char *valstring)
if (tunable_is_name (cur->name, name))
{
- /* If we are in a secure context (AT_SECURE) then ignore the tunable
- unless it is explicitly marked as secure. Tunable values take
- precendence over their envvar aliases. */
+ /* If we are in a secure context (AT_SECURE) then ignore the
+ tunable unless it is explicitly marked as secure. Tunable
+ values take precedence over their envvar aliases. We write
+ the tunables that are not SXID_ERASE back to TUNESTR, thus
+ dropping all SXID_ERASE tunables and any invalid or
+ unrecognized tunables. */
if (__libc_enable_secure)
{
- if (cur->security_level == TUNABLE_SECLEVEL_SXID_ERASE)
+ if (cur->security_level != TUNABLE_SECLEVEL_SXID_ERASE)
{
- if (p[len] == '\0')
- {
- /* Last tunable in the valstring. Null-terminate and
- return. */
- *name = '\0';
- return;
- }
- else
- {
- /* Remove the current tunable from the string. We do
- this by overwriting the string starting from NAME
- (which is where the current tunable begins) with
- the remainder of the string. We then have P point
- to NAME so that we continue in the correct
- position in the valstring. */
- char *q = &p[len + 1];
- p = name;
- while (*q != '\0')
- *name++ = *q++;
- name[0] = '\0';
- len = 0;
- }
+ if (off > 0)
+ tunestr[off++] = ':';
+
+ const char *n = cur->name;
+
+ while (*n != '\0')
+ tunestr[off++] = *n++;
+
+ tunestr[off++] = '=';
+
+ for (size_t j = 0; j < len; j++)
+ tunestr[off++] = value[j];
}
if (cur->security_level != TUNABLE_SECLEVEL_NONE)
@@ -258,9 +256,7 @@ parse_tunables (char *tunestr, char *valstring)
}
}
- if (p[len] == '\0')
- return;
- else
+ if (p[len] != '\0')
p += len + 1;
}
}
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
index e92e9f5fbf..0e9aedeb2e 100644
--- a/elf/tst-env-setuid-tunables.c
+++ b/elf/tst-env-setuid-tunables.c
@@ -25,35 +25,76 @@
#include "config.h"
#undef _LIBC
-#define test_parent test_parent_tunables
-#define test_child test_child_tunables
-
-static int test_child_tunables (void);
-static int test_parent_tunables (void);
-
-#include "tst-env-setuid.c"
-
-#define CHILD_VALSTRING_VALUE "glibc.malloc.mmap_threshold=4096"
-#define PARENT_VALSTRING_VALUE \
- "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096"
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <intprops.h>
+#include <array_length.h>
+
+#include <support/check.h>
+#include <support/support.h>
+#include <support/test-driver.h>
+#include <support/capture_subprocess.h>
+
+const char *teststrings[] =
+{
+ "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.check=2:glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096:glibc.malloc.check=2",
+ "glibc.malloc.perturb=0x800",
+ "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+ "glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096",
+ "not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2",
+ "glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096",
+ ":glibc.malloc.garbage=2:glibc.malloc.check=1",
+ "glibc.malloc.check=1:glibc.malloc.check=2",
+ "not_valid.malloc.check=2",
+ "glibc.not_valid.check=2",
+};
+
+const char *resultstrings[] =
+{
+ "glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.perturb=0x800",
+ "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.mmap_threshold=4096",
+ "glibc.malloc.mmap_threshold=4096",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+};
static int
-test_child_tunables (void)
+test_child (int off)
{
const char *val = getenv ("GLIBC_TUNABLES");
#if HAVE_TUNABLES
- if (val != NULL && strcmp (val, CHILD_VALSTRING_VALUE) == 0)
+ if (val != NULL && strcmp (val, resultstrings[off]) == 0)
return 0;
if (val != NULL)
- printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val);
+ printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val);
return 1;
#else
if (val != NULL)
{
- printf ("GLIBC_TUNABLES not cleared\n");
+ printf ("[%d] GLIBC_TUNABLES not cleared\n", off);
return 1;
}
return 0;
@@ -61,15 +102,48 @@ test_child_tunables (void)
}
static int
-test_parent_tunables (void)
+do_test (int argc, char **argv)
{
- const char *val = getenv ("GLIBC_TUNABLES");
+ /* Setgid child process. */
+ if (argc == 2)
+ {
+ if (getgid () == getegid ())
+ /* This can happen if the file system is mounted nosuid. */
+ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+ (intmax_t) getgid ());
- if (val != NULL && strcmp (val, PARENT_VALSTRING_VALUE) == 0)
- return 0;
+ int ret = test_child (atoi (argv[1]));
- if (val != NULL)
- printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val);
+ if (ret != 0)
+ exit (1);
- return 1;
+ exit (EXIT_SUCCESS);
+ }
+ else
+ {
+ int ret = 0;
+
+ /* Spawn tests. */
+ for (int i = 0; i < array_length (teststrings); i++)
+ {
+ char buf[INT_BUFSIZE_BOUND (int)];
+
+ printf ("Spawned test for %s (%d)\n", teststrings[i], i);
+ snprintf (buf, sizeof (buf), "%d\n", i);
+ if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0)
+ exit (1);
+
+ int status = support_capture_subprogram_self_sgid (buf);
+
+ /* Bail out early if unsupported. */
+ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+ return EXIT_UNSUPPORTED;
+
+ ret |= status;
+ }
+ return ret;
+ }
}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
index a080d2fa1c..2350976f8c 100644
--- a/elf/tst-env-setuid.c
+++ b/elf/tst-env-setuid.c
@@ -29,173 +29,12 @@
#include <sys/wait.h>
#include <unistd.h>
+#include <support/check.h>
#include <support/support.h>
#include <support/test-driver.h>
+#include <support/capture_subprocess.h>
static char SETGID_CHILD[] = "setgid-child";
-#define CHILD_STATUS 42
-
-/* Return a GID which is not our current GID, but is present in the
- supplementary group list. */
-static gid_t
-choose_gid (void)
-{
- const int count = 64;
- gid_t groups[count];
- int ret = getgroups (count, groups);
- if (ret < 0)
- {
- printf ("getgroups: %m\n");
- exit (1);
- }
- gid_t current = getgid ();
- for (int i = 0; i < ret; ++i)
- {
- if (groups[i] != current)
- return groups[i];
- }
- return 0;
-}
-
-/* Spawn and execute a program and verify that it returns the CHILD_STATUS. */
-static pid_t
-do_execve (char **args)
-{
- pid_t kid = vfork ();
-
- if (kid < 0)
- {
- printf ("vfork: %m\n");
- return -1;
- }
-
- if (kid == 0)
- {
- /* Child process. */
- execve (args[0], args, environ);
- _exit (-errno);
- }
-
- if (kid < 0)
- return 1;
-
- int status;
-
- if (waitpid (kid, &status, 0) < 0)
- {
- printf ("waitpid: %m\n");
- return 1;
- }
-
- if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
- return EXIT_UNSUPPORTED;
-
- if (!WIFEXITED (status) || WEXITSTATUS (status) != CHILD_STATUS)
- {
- printf ("Unexpected exit status %d from child process\n",
- WEXITSTATUS (status));
- return 1;
- }
- return 0;
-}
-
-/* Copies the executable into a restricted directory, so that we can
- safely make it SGID with the TARGET group ID. Then runs the
- executable. */
-static int
-run_executable_sgid (gid_t target)
-{
- char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
- test_dir, (intmax_t) getpid ());
- char *execname = xasprintf ("%s/bin", dirname);
- int infd = -1;
- int outfd = -1;
- int ret = 0;
- if (mkdir (dirname, 0700) < 0)
- {
- printf ("mkdir: %m\n");
- goto err;
- }
- infd = open ("/proc/self/exe", O_RDONLY);
- if (infd < 0)
- {
- printf ("open (/proc/self/exe): %m\n");
- goto err;
- }
- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
- if (outfd < 0)
- {
- printf ("open (%s): %m\n", execname);
- goto err;
- }
- char buf[4096];
- for (;;)
- {
- ssize_t rdcount = read (infd, buf, sizeof (buf));
- if (rdcount < 0)
- {
- printf ("read: %m\n");
- goto err;
- }
- if (rdcount == 0)
- break;
- char *p = buf;
- char *end = buf + rdcount;
- while (p != end)
- {
- ssize_t wrcount = write (outfd, buf, end - p);
- if (wrcount == 0)
- errno = ENOSPC;
- if (wrcount <= 0)
- {
- printf ("write: %m\n");
- goto err;
- }
- p += wrcount;
- }
- }
- if (fchown (outfd, getuid (), target) < 0)
- {
- printf ("fchown (%s): %m\n", execname);
- goto err;
- }
- if (fchmod (outfd, 02750) < 0)
- {
- printf ("fchmod (%s): %m\n", execname);
- goto err;
- }
- if (close (outfd) < 0)
- {
- printf ("close (outfd): %m\n");
- goto err;
- }
- if (close (infd) < 0)
- {
- printf ("close (infd): %m\n");
- goto err;
- }
-
- char *args[] = {execname, SETGID_CHILD, NULL};
-
- ret = do_execve (args);
-
-err:
- if (outfd >= 0)
- close (outfd);
- if (infd >= 0)
- close (infd);
- if (execname)
- {
- unlink (execname);
- free (execname);
- }
- if (dirname)
- {
- rmdir (dirname);
- free (dirname);
- }
- return ret;
-}
#ifndef test_child
static int
@@ -256,40 +95,32 @@ do_test (int argc, char **argv)
if (argc == 2 && strcmp (argv[1], SETGID_CHILD) == 0)
{
if (getgid () == getegid ())
- {
- /* This can happen if the file system is mounted nosuid. */
- fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n",
- (intmax_t) getgid ());
- exit (EXIT_UNSUPPORTED);
- }
+ /* This can happen if the file system is mounted nosuid. */
+ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+ (intmax_t) getgid ());
int ret = test_child ();
if (ret != 0)
exit (1);
- exit (CHILD_STATUS);
+ exit (EXIT_SUCCESS);
}
else
{
if (test_parent () != 0)
exit (1);
- /* Try running a setgid program. */
- gid_t target = choose_gid ();
- if (target == 0)
- {
- fprintf (stderr,
- "Could not find a suitable GID for user %jd, skipping test\n",
- (intmax_t) getuid ());
- exit (0);
- }
+ int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
- return run_executable_sgid (target);
- }
+ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+ return EXIT_UNSUPPORTED;
+
+ if (!WIFEXITED (status))
+ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
- /* Something went wrong and our argv was corrupted. */
- _exit (1);
+ return 0;
+ }
}
#define TEST_FUNCTION_ARGV do_test
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
index 94de199530..ab793ae4f8 100644
--- a/stdlib/tst-secure-getenv.c
+++ b/stdlib/tst-secure-getenv.c
@@ -30,167 +30,12 @@
#include <sys/wait.h>
#include <unistd.h>
+#include <support/check.h>
#include <support/support.h>
+#include <support/capture_subprocess.h>
#include <support/test-driver.h>
static char MAGIC_ARGUMENT[] = "run-actual-test";
-#define MAGIC_STATUS 19
-
-/* Return a GID which is not our current GID, but is present in the
- supplementary group list. */
-static gid_t
-choose_gid (void)
-{
- int count = getgroups (0, NULL);
- if (count < 0)
- {
- printf ("getgroups: %m\n");
- exit (1);
- }
- gid_t *groups;
- groups = xcalloc (count, sizeof (*groups));
- int ret = getgroups (count, groups);
- if (ret < 0)
- {
- printf ("getgroups: %m\n");
- exit (1);
- }
- gid_t current = getgid ();
- gid_t not_current = 0;
- for (int i = 0; i < ret; ++i)
- {
- if (groups[i] != current)
- {
- not_current = groups[i];
- break;
- }
- }
- free (groups);
- return not_current;
-}
-
-
-/* Copies the executable into a restricted directory, so that we can
- safely make it SGID with the TARGET group ID. Then runs the
- executable. */
-static int
-run_executable_sgid (gid_t target)
-{
- char *dirname = xasprintf ("%s/secure-getenv.%jd",
- test_dir, (intmax_t) getpid ());
- char *execname = xasprintf ("%s/bin", dirname);
- int infd = -1;
- int outfd = -1;
- int ret = -1;
- if (mkdir (dirname, 0700) < 0)
- {
- printf ("mkdir: %m\n");
- goto err;
- }
- infd = open ("/proc/self/exe", O_RDONLY);
- if (infd < 0)
- {
- printf ("open (/proc/self/exe): %m\n");
- goto err;
- }
- outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
- if (outfd < 0)
- {
- printf ("open (%s): %m\n", execname);
- goto err;
- }
- char buf[4096];
- for (;;)
- {
- ssize_t rdcount = read (infd, buf, sizeof (buf));
- if (rdcount < 0)
- {
- printf ("read: %m\n");
- goto err;
- }
- if (rdcount == 0)
- break;
- char *p = buf;
- char *end = buf + rdcount;
- while (p != end)
- {
- ssize_t wrcount = write (outfd, buf, end - p);
- if (wrcount == 0)
- errno = ENOSPC;
- if (wrcount <= 0)
- {
- printf ("write: %m\n");
- goto err;
- }
- p += wrcount;
- }
- }
- if (fchown (outfd, getuid (), target) < 0)
- {
- printf ("fchown (%s): %m\n", execname);
- goto err;
- }
- if (fchmod (outfd, 02750) < 0)
- {
- printf ("fchmod (%s): %m\n", execname);
- goto err;
- }
- if (close (outfd) < 0)
- {
- printf ("close (outfd): %m\n");
- goto err;
- }
- if (close (infd) < 0)
- {
- printf ("close (infd): %m\n");
- goto err;
- }
-
- int kid = fork ();
- if (kid < 0)
- {
- printf ("fork: %m\n");
- goto err;
- }
- if (kid == 0)
- {
- /* Child process. */
- char *args[] = { execname, MAGIC_ARGUMENT, NULL };
- execve (execname, args, environ);
- printf ("execve (%s): %m\n", execname);
- _exit (1);
- }
- int status;
- if (waitpid (kid, &status, 0) < 0)
- {
- printf ("waitpid: %m\n");
- goto err;
- }
- if (!WIFEXITED (status) || WEXITSTATUS (status) != MAGIC_STATUS)
- {
- printf ("Unexpected exit status %d from child process\n",
- status);
- goto err;
- }
- ret = 0;
-
-err:
- if (outfd >= 0)
- close (outfd);
- if (infd >= 0)
- close (infd);
- if (execname)
- {
- unlink (execname);
- free (execname);
- }
- if (dirname)
- {
- rmdir (dirname);
- free (dirname);
- }
- return ret;
-}
static int
do_test (void)
@@ -212,15 +57,15 @@ do_test (void)
exit (1);
}
- gid_t target = choose_gid ();
- if (target == 0)
- {
- fprintf (stderr,
- "Could not find a suitable GID for user %jd, skipping test\n",
- (intmax_t) getuid ());
- exit (0);
- }
- return run_executable_sgid (target);
+ int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
+
+ if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+ return EXIT_UNSUPPORTED;
+
+ if (!WIFEXITED (status))
+ FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+
+ return 0;
}
static void
@@ -229,23 +74,15 @@ alternative_main (int argc, char **argv)
if (argc == 2 && strcmp (argv[1], MAGIC_ARGUMENT) == 0)
{
if (getgid () == getegid ())
- {
- /* This can happen if the file system is mounted nosuid. */
- fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n",
- (intmax_t) getgid ());
- exit (MAGIC_STATUS);
- }
+ /* This can happen if the file system is mounted nosuid. */
+ FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+ (intmax_t) getgid ());
if (getenv ("PATH") == NULL)
- {
- printf ("PATH variable not present\n");
- exit (3);
- }
+ FAIL_EXIT (3, "PATH variable not present\n");
if (secure_getenv ("PATH") != NULL)
- {
- printf ("PATH variable not filtered out\n");
- exit (4);
- }
- exit (MAGIC_STATUS);
+ FAIL_EXIT (4, "PATH variable not filtered out\n");
+
+ exit (EXIT_SUCCESS);
}
}
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 8c753382a1..f7d0896f47 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -27,6 +27,7 @@
#ifndef WIDE
# define STRNLEN strnlen
+# define MEMSET memset
# define CHAR char
# define BIG_CHAR CHAR_MAX
# define MIDDLE_CHAR 127
@@ -34,6 +35,7 @@
#else
# include <wchar.h>
# define STRNLEN wcsnlen
+# define MEMSET wmemset
# define CHAR wchar_t
# define BIG_CHAR WCHAR_MAX
# define MIDDLE_CHAR 1121
@@ -153,7 +155,7 @@ do_page_tests (void)
size_t last_offset = (page_size / sizeof (CHAR)) - 1;
CHAR *s = (CHAR *) buf2;
- memset (s, 65, (last_offset - 1));
+ MEMSET (s, 65, (last_offset - 1));
s[last_offset] = 0;
/* Place short strings ending at page boundary. */
@@ -196,6 +198,35 @@ do_page_tests (void)
}
}
+/* Tests meant to expose failures in implementations that access bytes
+ beyond the maximum length. */
+
+static void
+do_page_2_tests (void)
+{
+ size_t i, exp_len, offset;
+ size_t last_offset = page_size / sizeof (CHAR);
+
+ CHAR *s = (CHAR *) buf2;
+ MEMSET (s, 65, last_offset);
+
+ /* Place short strings ending at page boundary without the null
+ byte. */
+ offset = last_offset;
+ for (i = 0; i < 128; i++)
+ {
+ /* Decrease offset to stress several sizes and alignments. */
+ offset--;
+ exp_len = last_offset - offset;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ /* If an implementation goes beyond EXP_LEN, it will trigger
+ the segfault. */
+ do_one_test (impl, (CHAR *) (s + offset), exp_len, exp_len);
+ }
+ }
+}
+
int
test_main (void)
{
@@ -242,6 +273,7 @@ test_main (void)
do_random_tests ();
do_page_tests ();
+ do_page_2_tests ();
return ret;
}
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
index 2832cfc635..e42a84e70e 100644
--- a/support/capture_subprocess.h
+++ b/support/capture_subprocess.h
@@ -41,6 +41,12 @@ struct support_capture_subprocess support_capture_subprocess
struct support_capture_subprocess support_capture_subprogram
(const char *file, char *const argv[]);
+/* Copy the running program into a setgid binary and run it with CHILD_ID
+ argument. If execution is successful, return the wait status of the child
+ program (as from waitpid), otherwise return a non-zero failure exit code. */
+int support_capture_subprogram_self_sgid
+ (char *child_id);
+
/* Deallocate the subprocess data captured by
support_capture_subprocess. */
void support_capture_subprocess_free (struct support_capture_subprocess *);
diff --git a/support/subprocess.h b/support/subprocess.h
index c031878d94..a19335ee5d 100644
--- a/support/subprocess.h
+++ b/support/subprocess.h
@@ -38,6 +38,11 @@ struct support_subprocess support_subprocess
struct support_subprocess support_subprogram
(const char *file, char *const argv[]);
+/* Invoke program FILE with ARGV arguments by using posix_spawn and wait for it
+ to complete. Return the wait status of the program. */
+int support_subprogram_wait
+ (const char *file, char *const argv[]);
+
/* Wait for the subprocess indicated by PROC::PID. Return the status
reported by the waitpid call. */
int support_process_wait (struct support_subprocess *proc);
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index 948ce5a0c6..492b76d114 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -20,11 +20,14 @@
#include <support/capture_subprocess.h>
#include <errno.h>
+#include <fcntl.h>
#include <stdlib.h>
#include <support/check.h>
#include <support/xunistd.h>
#include <support/xsocket.h>
#include <support/xspawn.h>
+#include <support/support.h>
+#include <support/test-driver.h>
static void
transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream)
@@ -36,7 +39,7 @@ transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream)
if (ret < 0)
{
support_record_failure ();
- printf ("error: reading from subprocess %s: %m", what);
+ printf ("error: reading from subprocess %s: %m\n", what);
pfd->events = 0;
pfd->revents = 0;
}
@@ -102,6 +105,129 @@ support_capture_subprogram (const char *file, char *const argv[])
return result;
}
+/* Copies the executable into a restricted directory, so that we can
+ safely make it SGID with the group ID GID. Then runs the executable
+ with CHILD_ID as its argument. */
+static int
+copy_and_spawn_sgid (char *child_id, gid_t gid)
+{
+ char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
+ test_dir, (intmax_t) getpid ());
+ char *execname = xasprintf ("%s/bin", dirname);
+ int infd = -1;
+ int outfd = -1;
+ int ret = 1, status = 1;
+
+ TEST_VERIFY (mkdir (dirname, 0700) == 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+
+ infd = open ("/proc/self/exe", O_RDONLY);
+ if (infd < 0)
+ FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
+
+ outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
+ TEST_VERIFY (outfd >= 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+
+ char buf[4096];
+ for (;;)
+ {
+ ssize_t rdcount = read (infd, buf, sizeof (buf));
+ TEST_VERIFY (rdcount >= 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+ if (rdcount == 0)
+ break;
+ char *p = buf;
+ char *end = buf + rdcount;
+ while (p != end)
+ {
+ ssize_t wrcount = write (outfd, buf, end - p);
+ if (wrcount == 0)
+ errno = ENOSPC;
+ TEST_VERIFY (wrcount > 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+ p += wrcount;
+ }
+ }
+ TEST_VERIFY (fchown (outfd, getuid (), gid) == 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+ TEST_VERIFY (fchmod (outfd, 02750) == 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+ TEST_VERIFY (close (outfd) == 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+ TEST_VERIFY (close (infd) == 0);
+ if (support_record_failure_is_failed ())
+ goto err;
+
+ /* We have the binary, now spawn the subprocess. Avoid using
+ support_subprogram because we only want the program exit status, not the
+ contents. */
+ ret = 0;
+
+ char * const args[] = {execname, child_id, NULL};
+
+ status = support_subprogram_wait (args[0], args);
+
+err:
+ if (outfd >= 0)
+ close (outfd);
+ if (infd >= 0)
+ close (infd);
+ if (execname != NULL)
+ {
+ unlink (execname);
+ free (execname);
+ }
+ if (dirname != NULL)
+ {
+ rmdir (dirname);
+ free (dirname);
+ }
+
+ if (ret != 0)
+ FAIL_EXIT1("Failed to make sgid executable for test\n");
+
+ return status;
+}
+
+int
+support_capture_subprogram_self_sgid (char *child_id)
+{
+ gid_t target = 0;
+ const int count = 64;
+ gid_t groups[count];
+
+ /* Get a GID which is not our current GID, but is present in the
+ supplementary group list. */
+ int ret = getgroups (count, groups);
+ if (ret < 0)
+ FAIL_UNSUPPORTED("Could not get group list for user %jd\n",
+ (intmax_t) getuid ());
+
+ gid_t current = getgid ();
+ for (int i = 0; i < ret; ++i)
+ {
+ if (groups[i] != current)
+ {
+ target = groups[i];
+ break;
+ }
+ }
+
+ if (target == 0)
+ FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
+ (intmax_t) getuid ());
+
+ return copy_and_spawn_sgid (child_id, target);
+}
+
void
support_capture_subprocess_free (struct support_capture_subprocess *p)
{
diff --git a/support/support_subprocess.c b/support/support_subprocess.c
index 0c8cc6af30..97e481e2d9 100644
--- a/support/support_subprocess.c
+++ b/support/support_subprocess.c
@@ -27,7 +27,7 @@
#include <support/subprocess.h>
static struct support_subprocess
-support_suprocess_init (void)
+support_subprocess_init (void)
{
struct support_subprocess result;
@@ -48,7 +48,7 @@ support_suprocess_init (void)
struct support_subprocess
support_subprocess (void (*callback) (void *), void *closure)
{
- struct support_subprocess result = support_suprocess_init ();
+ struct support_subprocess result = support_subprocess_init ();
result.pid = xfork ();
if (result.pid == 0)
@@ -71,7 +71,7 @@ support_subprocess (void (*callback) (void *), void *closure)
struct support_subprocess
support_subprogram (const char *file, char *const argv[])
{
- struct support_subprocess result = support_suprocess_init ();
+ struct support_subprocess result = support_subprocess_init ();
posix_spawn_file_actions_t fa;
/* posix_spawn_file_actions_init does not fail. */
@@ -84,7 +84,7 @@ support_subprogram (const char *file, char *const argv[])
xposix_spawn_file_actions_addclose (&fa, result.stdout_pipe[1]);
xposix_spawn_file_actions_addclose (&fa, result.stderr_pipe[1]);
- result.pid = xposix_spawn (file, &fa, NULL, argv, NULL);
+ result.pid = xposix_spawn (file, &fa, NULL, argv, environ);
xclose (result.stdout_pipe[1]);
xclose (result.stderr_pipe[1]);
@@ -93,6 +93,19 @@ support_subprogram (const char *file, char *const argv[])
}
int
+/* Spawn FILE with argument vector ARGV and the current environment,
+   then hand the child to support_process_wait and return whatever that
+   function reports.  No file actions are installed, so the child's
+   standard streams are not redirected by this function.  */
+support_subprogram_wait (const char *file, char *const argv[])
+{
+  posix_spawn_file_actions_t fa;
+
+  /* posix_spawn_file_actions_init does not fail.  */
+  posix_spawn_file_actions_init (&fa);
+  struct support_subprocess res = support_subprocess_init ();
+
+  res.pid = xposix_spawn (file, &fa, NULL, argv, environ);
+
+  /* The file actions object is no longer needed once the child has
+     been spawned.  */
+  posix_spawn_file_actions_destroy (&fa);
+
+  /* As in support_subprogram, the parent has no use for the write ends
+     of the pipes; close them here so they are not leaked.  */
+  xclose (res.stdout_pipe[1]);
+  xclose (res.stderr_pipe[1]);
+
+  return support_process_wait (&res);
+}
+
+int
support_process_wait (struct support_subprocess *proc)
{
xclose (proc->stdout_pipe[0]);
diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure
index fa46e9e351..e7f576338d 100644
--- a/sysdeps/s390/configure
+++ b/sysdeps/s390/configure
@@ -123,7 +123,9 @@ void testinsn (char *buf)
__asm__ (".machine \"arch13\" \n\t"
".machinemode \"zarch_nohighgprs\" \n\t"
"lghi %%r0,16 \n\t"
- "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ "mvcrl 0(%0),32(%0) \n\t"
+ "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ : : "a" (buf) : "memory", "r0");
}
EOF
if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
@@ -271,7 +273,9 @@ else
void testinsn (char *buf)
{
__asm__ ("lghi %%r0,16 \n\t"
- "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ "mvcrl 0(%0),32(%0) \n\t"
+ "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ : : "a" (buf) : "memory", "r0");
}
EOF
if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac
index 3ed5a8ef87..5c3479e8cf 100644
--- a/sysdeps/s390/configure.ac
+++ b/sysdeps/s390/configure.ac
@@ -88,7 +88,9 @@ void testinsn (char *buf)
__asm__ (".machine \"arch13\" \n\t"
".machinemode \"zarch_nohighgprs\" \n\t"
"lghi %%r0,16 \n\t"
- "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ "mvcrl 0(%0),32(%0) \n\t"
+ "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ : : "a" (buf) : "memory", "r0");
}
EOF
dnl test, if assembler supports S390 arch13 instructions
@@ -195,7 +197,9 @@ cat > conftest.c <<\EOF
void testinsn (char *buf)
{
__asm__ ("lghi %%r0,16 \n\t"
- "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ "mvcrl 0(%0),32(%0) \n\t"
+ "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ : : "a" (buf) : "memory", "r0");
}
EOF
dnl test, if assembler supports S390 arch13 zarch instructions as default
diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c
index fb6b69ae2f..dc27b1db4c 100644
--- a/sysdeps/s390/memmove.c
+++ b/sysdeps/s390/memmove.c
@@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden;
s390_libc_ifunc_expr (__redirect_memmove, memmove,
({
s390_libc_ifunc_expr_stfle_init ();
- (HAVE_MEMMOVE_ARCH13
+ (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2)
&& S390_IS_ARCH13_MIE3 (stfle_bits))
? MEMMOVE_ARCH13
: (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX))
diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
index 1948436417..6d05652c8a 100644
--- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
@@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memmove,
# if HAVE_MEMMOVE_ARCH13
IFUNC_IMPL_ADD (array, i, memmove,
- S390_IS_ARCH13_MIE3 (stfle_bits),
+ ((dl_hwcap & HWCAP_S390_VXRS_EXT2)
+ && S390_IS_ARCH13_MIE3 (stfle_bits)),
MEMMOVE_ARCH13)
# endif
# if HAVE_MEMMOVE_Z13
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index a5112ef367..a93139b790 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -20,6 +20,33 @@ endif
endif
endif
+ifeq ($(subdir),string)
+sysdep_routines += cacheinfo
+
+tests += \
+ tst-memchr-rtm \
+ tst-memcmp-rtm \
+ tst-memmove-rtm \
+ tst-memrchr-rtm \
+ tst-memset-rtm \
+ tst-strchr-rtm \
+ tst-strcpy-rtm \
+ tst-strlen-rtm \
+ tst-strncmp-rtm \
+ tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
+endif
+
ifeq ($(enable-cet),yes)
ifeq ($(subdir),elf)
sysdep-dl-routines += dl-cet
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 4bab154913..a4d1eacbe7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -424,8 +424,24 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
else
- cpu_features->feature[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ {
+ cpu_features->feature[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+ |= bit_arch_Prefer_No_VZEROUPPER;
+
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+ AVX2 strcmp is faster than EVEX strcmp. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+ cpu_features->feature[index_arch_Prefer_AVX2_STRCMP]
+ |= bit_arch_Prefer_AVX2_STRCMP;
+ }
}
/* This spells out "AuthenticAMD" or "HygonGenuine". */
else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 03a9b2a5e9..ca2924bd95 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -897,6 +897,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define bit_arch_Prefer_FSRM (1u << 13)
#define bit_arch_Prefer_No_AVX512 (1u << 14)
#define bit_arch_MathVec_Prefer_No_AVX512 (1u << 15)
+#define bit_arch_Prefer_AVX2_STRCMP (1u << 16)
#define index_arch_Fast_Rep_String FEATURE_INDEX_2
#define index_arch_Fast_Copy_Backward FEATURE_INDEX_2
@@ -914,6 +915,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define index_arch_Prefer_No_AVX512 FEATURE_INDEX_2
#define index_arch_MathVec_Prefer_No_AVX512 FEATURE_INDEX_2
#define index_arch_Prefer_FSRM FEATURE_INDEX_2
+#define index_arch_Prefer_AVX2_STRCMP FEATURE_INDEX_2
/* XCR0 Feature flags. */
#define bit_XMM_state (1u << 1)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 2cb315111b..d4d5e450a6 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -282,6 +282,9 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_ARCH_BOTH (n, cpu_features,
Fast_Copy_Backward, disable,
18);
+ CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2_Usable,
+ disable, 18);
}
break;
case 19:
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 0000000000..e47494011e
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill string1 with 'a', plant a 'c' at offsets 100 and
+   STRING_SIZE - 100, and check that memchr reports the first of the
+   two.  Returns EXIT_SUCCESS on a correct result, EXIT_FAILURE
+   otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+
+  const char *hit = memchr (string1, 'c', STRING_SIZE);
+  return hit == string1 + 100 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when memchr locates the 'c' at offset 100 of string1 and 1
+   for any other result.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  const char *hit = memchr (string1, 'c', STRING_SIZE);
+  return hit != string1 + 100;
+}
+
+/* Run do_test_1 on memchr for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 0000000000..e4c8a623bb
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill string1 and string2 with 'a' and check that memcmp reports the
+   two buffers as equal.  Returns EXIT_SUCCESS if it does,
+   EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  memset (string2, 'a', STRING_SIZE);
+
+  int difference = memcmp (string1, string2, STRING_SIZE);
+  return difference == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when memcmp reports string1 and string2 as equal over
+   STRING_SIZE bytes, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  return memcmp (string1, string2, STRING_SIZE) != 0;
+}
+
+/* Run do_test_1 on memcmp for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 0000000000..4bf97ef1e3
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill string1 with 'a', copy it to string2 with memmove, and check
+   both the returned pointer and the copied contents.  Returns
+   EXIT_SUCCESS when both checks pass, EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+
+  /* The contents are only compared when memmove returned string2.  */
+  if (memmove (string2, string1, STRING_SIZE) != string2)
+    return EXIT_FAILURE;
+  if (memcmp (string2, string1, STRING_SIZE) != 0)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
+
+/* Copy string1 to string2 with memmove.  Return 0 when memmove returns
+   string2 and the buffers then compare equal, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  /* The contents are only compared when memmove returned string2.  */
+  if (memmove (string2, string1, STRING_SIZE) != string2)
+    return 1;
+  if (memcmp (string2, string1, STRING_SIZE) != 0)
+    return 1;
+  return 0;
+}
+
+/* Run do_test_1 on memmove for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 0000000000..a57a5a8eb9
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill string1 with 'a', plant a 'c' at offsets 100 and
+   STRING_SIZE - 100, and check that memrchr reports the last of the
+   two.  Returns EXIT_SUCCESS on a correct result, EXIT_FAILURE
+   otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+
+  const char *hit = memrchr (string1, 'c', STRING_SIZE);
+  return hit == string1 + (STRING_SIZE - 100) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when memrchr locates the 'c' at offset STRING_SIZE - 100 of
+   string1 and 1 for any other result.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  const char *hit = memrchr (string1, 'c', STRING_SIZE);
+  return hit != string1 + (STRING_SIZE - 100);
+}
+
+/* Run do_test_1 on memrchr for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 0000000000..bf343a4dad
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill all STRING_SIZE bytes of string1 with 'a'.  The result is not
+   inspected, so this always returns EXIT_SUCCESS.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return EXIT_SUCCESS;
+}
+
+/* Repeat the memset of string1.  The buffer contents are not checked
+   afterwards, so this always returns 0.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return 0;
+}
+
+/* Run do_test_1 on memset for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 0000000000..a82e29c072
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill the first STRING_SIZE - 1 bytes of string1 with 'a', plant a
+   'c' at offsets 100 and STRING_SIZE - 100, and check that strchr
+   reports the first of the two.  Returns EXIT_SUCCESS on a correct
+   result, EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+
+  const char *hit = strchr (string1, 'c');
+  return hit == string1 + 100 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when strchr locates the 'c' at offset 100 of string1 and 1
+   for any other result.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  const char *hit = strchr (string1, 'c');
+  return hit != string1 + 100;
+}
+
+/* Run do_test_1 on strchr for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 0000000000..2b2a583fb4
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill the first STRING_SIZE - 1 bytes of string1 with 'a', copy it to
+   string2 with strcpy, and check both the returned pointer and the
+   copied string.  Returns EXIT_SUCCESS when both checks pass,
+   EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+
+  /* The strings are only compared when strcpy returned string2.  */
+  if (strcpy (string2, string1) != string2)
+    return EXIT_FAILURE;
+  if (strcmp (string2, string1) != 0)
+    return EXIT_FAILURE;
+  return EXIT_SUCCESS;
+}
+
+/* Copy string1 to string2 with strcpy.  Return 0 when strcpy returns
+   string2 and the strings then compare equal, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  /* The strings are only compared when strcpy returned string2.  */
+  if (strcpy (string2, string1) != string2)
+    return 1;
+  if (strcmp (string2, string1) != 0)
+    return 1;
+  return 0;
+}
+
+/* Run do_test_1 on strcpy for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 0000000000..6ed9eca017
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <cpu-features.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+/* Exercise FUNCTION outside and inside RTM transactions and check the
+   transaction abort rate.
+
+   NAME is the name of the string function under test; it is only used
+   in the failure message.  LOOP is the number of iterations.  PREPARE
+   is called once before the loop; if it does not return EXIT_SUCCESS,
+   its value is returned unchanged.  FUNCTION returns zero on success
+   and non-zero on failure.
+
+   Returns EXIT_UNSUPPORTED when RTM is not usable and EXIT_SUCCESS
+   when every call succeeded and at most 5% of the transactions
+   aborted.  A failing FUNCTION call or a higher abort rate is reported
+   through FAIL_EXIT1.  */
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+           int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int i;
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (i = 0; i < loop; i++)
+    {
+      /* First call: made before this iteration's transaction is
+         started.  */
+      failed |= function ();
+      if (_xbegin () == _XBEGIN_STARTED)
+        {
+          /* Second call: made inside the transaction.  */
+          failed |= function ();
+          _xend ();
+        }
+      else
+        {
+          /* The transaction did not start or was aborted; still make
+             the second call and count the abort.  */
+          failed |= function ();
+          ++naborts;
+        }
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+         TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+        /* NABORTS and LOOP are unsigned, so print them with %u.  */
+        FAIL_EXIT1 ("TSX abort rate: %.2f%% (%u out of %u)",
+                    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 0000000000..0dcf14db87
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill the first STRING_SIZE - 1 bytes of string1 with 'a', terminate
+   the string at offset STRING_SIZE - 100, and check that strlen
+   reports that length.  Returns EXIT_SUCCESS on a correct result,
+   EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';
+
+  return strlen (string1) == STRING_SIZE - 100 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when strlen reports STRING_SIZE - 100 for string1 and 1 for
+   any other length.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  return strlen (string1) != STRING_SIZE - 100;
+}
+
+/* Run do_test_1 on strlen for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 0000000000..236ad951b5
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+/* Fill the first STRING_SIZE - 1 bytes of string1 and string2 with 'a'
+   and check that strncmp reports them as equal.  Returns EXIT_SUCCESS
+   if it does, EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  memset (string2, 'a', STRING_SIZE - 1);
+
+  int difference = strncmp (string1, string2, STRING_SIZE);
+  return difference == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when strncmp reports string1 and string2 as equal within
+   STRING_SIZE characters, 1 otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  return strncmp (string1, string2, STRING_SIZE) != 0;
+}
+
+/* Run do_test_1 on strncmp for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 0000000000..e32bfaf5f5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+/* Fill the first STRING_SIZE - 1 bytes of string1 with 'a', plant a
+   'c' at offset STRING_SIZE - 100, and check that strrchr reports it.
+   Returns EXIT_SUCCESS on a correct result, EXIT_FAILURE otherwise.  */
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = 'c';
+
+  const char *hit = strrchr (string1, 'c');
+  return hit == string1 + (STRING_SIZE - 100) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Return 0 when strrchr locates the 'c' at offset STRING_SIZE - 100 of
+   string1 and 1 for any other result.  */
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  const char *hit = strrchr (string1, 'c');
+  return hit != string1 + (STRING_SIZE - 100);
+}
+
+/* Run do_test_1 on strrchr for LOOP iterations with the prepare and
+   function callbacks of this file, and return its result.  */
+static int
+do_test (void)
+{
+ return do_test_1 ("strrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index ec96365217..f2217b20d5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 395e432c09..da1446d731 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,7 +43,45 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memmove-avx512-unaligned-erms \
memset-sse2-unaligned-erms \
memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms
+ memset-avx512-unaligned-erms \
+ memchr-avx2-rtm \
+ memcmp-avx2-movbe-rtm \
+ memmove-avx-unaligned-erms-rtm \
+ memrchr-avx2-rtm \
+ memset-avx2-unaligned-erms-rtm \
+ rawmemchr-avx2-rtm \
+ strchr-avx2-rtm \
+ strcmp-avx2-rtm \
+ strchrnul-avx2-rtm \
+ stpcpy-avx2-rtm \
+ stpncpy-avx2-rtm \
+ strcat-avx2-rtm \
+ strcpy-avx2-rtm \
+ strlen-avx2-rtm \
+ strncat-avx2-rtm \
+ strncmp-avx2-rtm \
+ strncpy-avx2-rtm \
+ strnlen-avx2-rtm \
+ strrchr-avx2-rtm \
+ memchr-evex \
+ memcmp-evex-movbe \
+ memmove-evex-unaligned-erms \
+ memrchr-evex \
+ memset-evex-unaligned-erms \
+ rawmemchr-evex \
+ stpcpy-evex \
+ stpncpy-evex \
+ strcat-evex \
+ strchr-evex \
+ strchrnul-evex \
+ strcmp-evex \
+ strcpy-evex \
+ strlen-evex \
+ strncat-evex \
+ strncmp-evex \
+ strncpy-evex \
+ strnlen-evex \
+ strrchr-evex
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
@@ -59,8 +97,24 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
wcsrchr-sse2 wcsrchr-avx2 \
- wcsnlen-sse4_1 wcsnlen-c \
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ wcschr-avx2-rtm \
+ wcscmp-avx2-rtm \
+ wcslen-avx2-rtm \
+ wcsncmp-avx2-rtm \
+ wcsnlen-avx2-rtm \
+ wcsrchr-avx2-rtm \
+ wmemchr-avx2-rtm \
+ wmemcmp-avx2-movbe-rtm \
+ wcschr-evex \
+ wcscmp-evex \
+ wcslen-evex \
+ wcsncmp-evex \
+ wcsnlen-evex \
+ wcsrchr-evex \
+ wmemchr-evex \
+ wmemcmp-evex-movbe
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index a70b62d950..12cff8451f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -21,16 +21,28 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
+ && CPU_FEATURES_CPU_P (cpu_features, BMI2))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 39a45d0742..e57fb42af3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -43,6 +43,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__memchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, memchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __memchr_evex)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
@@ -51,6 +60,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(HAS_ARCH_FEATURE (AVX2_Usable)
&& HAS_CPU_FEATURE (MOVBE)),
__memcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (MOVBE)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcmp_avx2_movbe_rtm)
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (MOVBE)),
+ __memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
@@ -64,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -76,6 +95,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -98,13 +131,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memmove_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memmove_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
@@ -121,6 +168,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memrchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__memrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, memrchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __memrchr_evex)
+
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
#ifdef SHARED
@@ -139,10 +195,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_chk_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_chk_avx2_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __memset_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __memset_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -164,10 +238,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memset_avx2_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __memset_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __memset_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
__memset_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -179,20 +271,51 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, rawmemchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__rawmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __rawmemchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __rawmemchr_evex)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen,
- HAS_ARCH_FEATURE (AVX2_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
__strlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)
+ && HAS_CPU_FEATURE (RTM)),
+ __strlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
IFUNC_IMPL_ADD (array, i, strnlen,
- HAS_ARCH_FEATURE (AVX2_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
__strnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)
+ && HAS_CPU_FEATURE (RTM)),
+ __strnlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -201,6 +324,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
__stpncpy_avx2)
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __stpncpy_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __stpncpy_evex)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
__stpncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -211,6 +342,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
__stpcpy_avx2)
+ IFUNC_IMPL_ADD (array, i, stpcpy,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __stpcpy_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, stpcpy,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __stpcpy_evex)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -245,6 +384,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
__strcat_avx2)
+ IFUNC_IMPL_ADD (array, i, strcat,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strcat_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strcat,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strcat_evex)
IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
__strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -255,6 +402,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__strchr_avx2)
+ IFUNC_IMPL_ADD (array, i, strchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __strchr_evex)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
@@ -263,6 +419,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strchrnul,
HAS_ARCH_FEATURE (AVX2_Usable),
__strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strchrnul_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __strchrnul_evex)
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
/* Support sysdeps/x86_64/multiarch/strrchr.c. */
@@ -270,6 +435,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strrchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__strrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, strrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strrchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strrchr_evex)
IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
/* Support sysdeps/x86_64/multiarch/strcmp.c. */
@@ -277,6 +450,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcmp,
HAS_ARCH_FEATURE (AVX2_Usable),
__strcmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strcmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strcmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __strcmp_evex)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
__strcmp_sse42)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
@@ -288,6 +470,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcpy,
IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
__strcpy_avx2)
+ IFUNC_IMPL_ADD (array, i, strcpy,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strcpy_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strcpy,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strcpy_evex)
IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
__strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -331,6 +521,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncat,
IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
__strncat_avx2)
+ IFUNC_IMPL_ADD (array, i, strncat,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strncat_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strncat,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strncat_evex)
IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
__strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -341,6 +539,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
__strncpy_avx2)
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strncpy_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strncpy_evex)
IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
__strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -370,6 +576,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcschr,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcschr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcschr_evex)
IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
/* Support sysdeps/x86_64/multiarch/wcsrchr.c. */
@@ -377,6 +592,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcsrchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcsrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsrchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcsrchr_evex)
IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscmp.c. */
@@ -384,6 +608,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcscmp,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcscmp_avx2)
+ IFUNC_IMPL_ADD (array, i, wcscmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcscmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcscmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcscmp_evex)
IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
/* Support sysdeps/x86_64/multiarch/wcsncmp.c. */
@@ -391,6 +624,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wcsncmp,
HAS_ARCH_FEATURE (AVX2_Usable),
__wcsncmp_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsncmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsncmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsncmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcsncmp_evex)
IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
/* Support sysdeps/x86_64/multiarch/wcscpy.c. */
@@ -402,16 +644,41 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
IFUNC_IMPL (i, name, wcslen,
IFUNC_IMPL_ADD (array, i, wcslen,
- HAS_ARCH_FEATURE (AVX2_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
__wcslen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcslen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ HAS_CPU_FEATURE (SSE4_1),
+ __wcslen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
IFUNC_IMPL (i, name, wcsnlen,
IFUNC_IMPL_ADD (array, i, wcsnlen,
- HAS_ARCH_FEATURE (AVX2_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (BMI2)
+ && HAS_CPU_FEATURE (RTM)),
+ __wcsnlen_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
HAS_CPU_FEATURE (SSE4_1),
__wcsnlen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
@@ -421,6 +688,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemchr,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemchr_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (BMI2)),
+ __wmemchr_evex)
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
@@ -429,6 +705,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(HAS_ARCH_FEATURE (AVX2_Usable)
&& HAS_CPU_FEATURE (MOVBE)),
__wmemcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (MOVBE)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemcmp_avx2_movbe_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ && HAS_CPU_FEATURE (MOVBE)),
+ __wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
@@ -443,7 +729,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemset_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __wmemset_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __wmemset_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__wmemset_avx512_unaligned))
#ifdef SHARED
@@ -453,10 +746,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -465,6 +758,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
@@ -486,6 +793,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __memcpy_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -494,10 +815,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__memcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -511,10 +832,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -523,6 +844,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_chk_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_chk_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_chk_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
@@ -542,10 +877,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__mempcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -553,6 +888,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_avx_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ (HAS_ARCH_FEATURE (AVX_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __mempcpy_avx_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
@@ -568,6 +917,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncmp,
HAS_ARCH_FEATURE (AVX2_Usable),
__strncmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncmp,
+ (HAS_ARCH_FEATURE (AVX2_Usable)
+ && HAS_CPU_FEATURE (RTM)),
+ __strncmp_avx2_rtm)
+ IFUNC_IMPL_ADD (array, i, strncmp,
+ (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ __strncmp_evex)
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
__strncmp_sse42)
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
@@ -583,6 +940,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX2_Usable),
__wmemset_chk_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
+ __wmemset_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- HAS_ARCH_FEATURE (AVX512F_Usable),
+ HAS_ARCH_FEATURE (AVX512VL_Usable),
__wmemset_chk_avx512_unaligned))
#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 945529f1c9..963d3274ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_CPU_P (cpu_features, MOVBE)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2_movbe);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ return OPTIMIZE (evex_movbe);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_movbe_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2_movbe);
+ }
if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 9d63ee5de1..01fc6b998b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -48,21 +56,42 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms_rtm);
+
+ return OPTIMIZE (avx_unaligned_rtm);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms);
- return OPTIMIZE (avx_unaligned);
+ return OPTIMIZE (avx_unaligned);
+ }
}
if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index c98d7577fc..198c8c6ba5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -45,21 +53,44 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
{
- if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
- return OPTIMIZE (avx2_unaligned_erms);
- else
- return OPTIMIZE (avx2_unaligned);
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms_rtm);
+
+ return OPTIMIZE (avx2_unaligned_rtm);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms);
+
+ return OPTIMIZE (avx2_unaligned);
+ }
}
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index bc6a70fc7c..b05d119267 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 0000000000..564cc8cbec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+ /* EVEX/AVX2 variants require AVX2, BMI2 and fast unaligned AVX loads. */
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ return OPTIMIZE (evex);
+ /* No AVX512VL+BW: use the RTM build of AVX2 when the CPU has RTM. */
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+ /* Plain AVX2 only when Prefer_No_VZEROUPPER is not set. */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+ /* No vector variant selected above: fall back to SSE4.1, then SSE2. */
+ if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
+ return OPTIMIZE (sse4_1);
+ /* Baseline implementation. */
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index f480b82288..2913a365d8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,9 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
static inline void *
@@ -27,14 +30,21 @@ IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
- return OPTIMIZE (avx512_unaligned);
- else
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ {
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ return OPTIMIZE (avx512_unaligned);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_unaligned_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
new file mode 100644
index 0000000000..87b076c7c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_avx2_rtm
+#endif
+/* Make the return sequence in memchr-avx2.S expand to the _XTEST form. */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+/* Each VZEROUPPER_RETURN site jumps to the shared L(return_vzeroupper). */
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+/* Emit this variant into its own <p>.avx.rtm section. */
+#define SECTION(p) p##.avx.rtm
+/* Reuse the AVX2 implementation with the overrides defined above. */
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index cfec1657b6..9ddeab2c66 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,319 +26,407 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits; the and also sets ZF for the jz below. */
+ and %RDX_LP, %RDX_LP
+# else
test %RDX_LP, %RDX_LP
+# endif
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
-# else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
- vpbroadcastb %xmm0, %ymm0
-# endif
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length <= CHAR_PER_VEC handle specially. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computing the return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
addq %rdi, %rax
- addq %rcx, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ /* Check if at last VEC_SIZE * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(CHAR_PER_VEC * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
- VZEROUPPER
- ret
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
+ VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER
- ret
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- VZEROUPPER
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER
- ret
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER
- ret
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x2):
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
+ remaining length was found to be > VEC_SIZE * 2. */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
- VZEROUPPER
- ret
+L(zero_end2):
+ VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
new file mode 100644
index 0000000000..f3fdad4fda
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -0,0 +1,478 @@
+/* memchr/wmemchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_evex
+# endif
+
+# ifdef USE_AS_WMEMCHR
+# define VPBROADCAST vpbroadcastd
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
+# define XMMMATCH xmm16
+# define YMMMATCH ymm16
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
+
+ .section .text.evex,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+ /* Check for zero length. */
+ test %RDX_LP, %RDX_LP
+ jz L(zero)
+
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ VPBROADCAST %esi, %YMMMATCH
+ /* Check if we may cross page boundary with one vector load. */
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* If length <= CHAR_PER_VEC handle specially. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Scale wchar_t index by CHAR_SIZE to get the byte offset. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
+# endif
+ ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
+ .p2align 4
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computing the return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide shift count by 4 since each bit in K0 represents 4
+ bytes. */
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
+# endif
+# ifdef USE_AS_WMEMCHR
+ andl $(CHAR_PER_VEC - 1), %eax
+# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Scale wchar_t index by CHAR_SIZE to get the byte offset. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 5
+L(aligned_more):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
+
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+ addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
+# endif
+
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
+ .p2align 4
+L(loop_4x_vec):
+ /* It would be possible to save some instructions using 4x VPCMP
+ but bottleneck on port 5 makes it not worth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes that match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
+# ifdef USE_AS_RAWMEMCHR
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
+# else
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
+ ja L(loop_4x_vec)
+
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
+
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
+
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
+
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has the NOT of matches with VEC1 (all ones when no match). */
+ kmovd %k1, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
+
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
+
+ kmovd %k2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3_return)
+
+ kmovd %k3, %eax
+ tzcntl %eax, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4
+L(last_vec_x1_return):
+ tzcntl %eax, %eax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Scale wchar_t index by CHAR_SIZE to get the byte offset. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
+# endif
+# else
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4
+L(last_vec_x2_return):
+ tzcntl %eax, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+# endif
+ ret
+
+ .p2align 4
+L(last_vec_x3_return):
+ tzcntl %eax, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Scale char index by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+# endif
+ ret
+
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
+ .p2align 4
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against the length mask. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2). */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
+ ret
+
+L(last_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(last_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..cf4eff5d4a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_avx2_movbe_rtm /* Entry symbol used by the included implementation. */
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST /* Defined outside this file; NOTE(review): confirm its exact sequence. */
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper) /* Return sites branch to one shared exit label. */
+
+#define SECTION(p) p##.avx.rtm /* SECTION(.text) becomes .text.avx.rtm. */
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index d779639a61..0a9eab7da1 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -47,6 +47,10 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
@@ -55,7 +59,7 @@
memcmp has to use UNSIGNED comparison for elemnts.
*/
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %RDX_LP
@@ -123,8 +127,8 @@ ENTRY (MEMCMP)
vptest %ymm0, %ymm5
jnc L(4x_vec_end)
xorl %eax, %eax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_2x_vec):
@@ -144,8 +148,7 @@ L(last_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec):
@@ -164,8 +167,7 @@ L(wmemcmp_return):
movzbl (%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_WMEMCMP
.p2align 4
@@ -367,8 +369,7 @@ L(last_4x_vec):
vpmovmskb %ymm2, %eax
subl $VEC_MASK, %eax
jnz L(first_vec)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -394,8 +395,7 @@ L(4x_vec_end):
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@@ -410,8 +410,7 @@ L(first_vec_x1):
movzbl VEC_SIZE(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@@ -426,7 +425,6 @@ L(first_vec_x2):
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
sub %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (MEMCMP)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 0000000000..9c093972e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+ to avoid branches.
+ 2. Use overlapping compare to avoid branch.
+ 3. Use vector compare when size >= 8 bytes (both memcmp and
+ wmemcmp).
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe
+# endif
+
+# define VMOVU vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+# define VEC_MASK 0xff
+# define XMM_MASK 0xf
+# else
+# define VEC_MASK 0xffffffff
+# define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.evex,"ax",@progbits
+ENTRY (MEMCMP) /* rdi = first area, rsi = second area, rdx = length; result in eax. */
+# ifdef USE_AS_WMEMCMP
+ shl $2, %RDX_LP /* Scale the length by 4 so rdx counts bytes from here on. */
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax /* Zero only if every element compared equal. */
+ jnz L(first_vec)
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_vec) /* At most 2 * VEC: one overlapping VEC finishes the job. */
+
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+ kandd %k1, %k2, %k5
+ kandd %k3, %k4, %k6
+ kandd %k5, %k6, %k6 /* k6 = AND of the four equality masks. */
+
+ kmovd %k6, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end) /* k1-k4 still hold the per-VEC masks. */
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi /* rdi = last 4 * VEC of the first area. */
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi /* rsi = last 4 * VEC of the second area. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k1, %k2, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5 /* k5 = AND of the four equality masks. */
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+ xorl %eax, %eax /* No difference found: return 0. */
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax /* Zero only if every element compared equal. */
+ jnz L(first_vec)
+
+L(last_vec):
+ /* Use overlapping loads to avoid branches. */
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi /* rdi = last VEC of the first area. */
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi /* rsi = last VEC of the second area. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret /* eax is 0 here: the areas are equal. */
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 differs within the 16 or 32 bytes at rdi/rsi. */
+ tzcntl %eax, %ecx /* ecx = index of the first differing element. */
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax /* Clear eax so setl below leaves 0 or 1. */
+ movl (%rdi, %rcx, 4), %edx
+ cmpl (%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+ setl %al /* al = 1 if the first area's element is smaller (signed). */
+ negl %eax /* eax = -1 or 0. */
+ orl $1, %eax /* eax = -1 or 1. */
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax /* Difference of the zero-extended bytes is the result. */
+# endif
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4): /* Exactly one 4-byte element to compare. */
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return) /* Flags from cmpl are consumed by setl there. */
+ ret /* Equal: eax is still 0. */
+# else
+ .p2align 4
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches. */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax /* First 4 bytes become the high half. */
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi /* Last 4 bytes; the pointer in rdi is no longer needed. */
+ movbe -4(%rsi, %rdx), %esi /* Last 4 bytes; the pointer in rsi is no longer needed. */
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ je L(exit)
+ sbbl %eax, %eax /* eax = -1 if the subq borrowed, else 0. */
+ orl $1, %eax /* eax = -1 or 1. */
+ ret
+
+ .p2align 4
+L(exit):
+ ret /* Reached with rax == 0: the areas are equal. */
+
+ .p2align 4
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax /* First byte in bits 23:16, second in bits 15:8. */
+ bswap %ecx
+ movb -1(%rdi, %rdx), %al /* Last byte fills the low 8 bits. */
+ movb -1(%rsi, %rdx), %cl
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax /* Single byte: return the unsigned difference. */
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax /* Size 0: return 0. */
+ ret
+
+ .p2align 4
+L(less_vec): /* Size < VEC_SIZE, so dl holds the whole size. */
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %XMM1 /* Loads the first 8 bytes of each area. */
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax /* Zero only if every element compared equal. */
+ jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
+ leaq -8(%rdi, %rdx), %rdi /* rdi = last 8 bytes of the first area. */
+ leaq -8(%rsi, %rdx), %rsi /* rsi = last 8 bytes of the second area. */
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret /* eax is 0 here: the areas are equal. */
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax /* Zero only if every element compared equal. */
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -16(%rdi, %rdx), %rdi /* rdi = last 16 bytes of the first area. */
+ leaq -16(%rsi, %rdx), %rsi /* rsi = last 16 bytes of the second area. */
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret /* eax is 0 here: the areas are equal. */
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* rcx = misalignment - VEC_SIZE: minus the number of bytes to skip. */
+ subq $VEC_SIZE, %rcx
+ /* Advance the second memory area by the same number of bytes. */
+ subq %rcx, %rsi
+ /* Advance the first memory area, which is VEC_SIZE-aligned afterwards. */
+ subq %rcx, %rdi
+ /* Shrink the length by the bytes skipped. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k2, %k1, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5 /* k5 = AND of the four equality masks. */
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end) /* k1-k4 still hold the per-VEC masks. */
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec) /* Loop while a full 4 * VEC remains. */
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec) /* Otherwise fall through to L(last_4x_vec). */
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax /* Zero only if every element compared equal. */
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi /* Advance both areas to the second VEC. */
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi /* rdi was advanced by VEC_SIZE: this is end - 2 * VEC. */
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi /* Same adjustment for the second area. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi /* Advance both areas to the final VEC. */
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret /* eax is 0 here: the areas are equal. */
+
+ .p2align 4
+L(4x_vec_end): /* k1-k4 = equality masks for the 4 VECs at rdi/rsi; at least one has a difference. */
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec) /* Difference in the VEC at offset 0. */
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1) /* Difference in the VEC at offset VEC_SIZE. */
+ kmovd %k3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2) /* Difference in the VEC at offset 2 * VEC_SIZE. */
+ kmovd %k4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx /* ecx = index of the first differing element in the fourth VEC. */
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return) /* Flags from cmpl are consumed by setl there. */
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax /* Difference of the zero-extended bytes is the result. */
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1): /* Difference is in the VEC at offset VEC_SIZE. */
+ tzcntl %eax, %ecx /* ecx = index of the first differing element. */
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return) /* Flags from cmpl are consumed by setl there. */
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax /* Difference of the zero-extended bytes is the result. */
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2): /* Difference is in the VEC at offset 2 * VEC_SIZE. */
+ tzcntl %eax, %ecx /* ecx = index of the first differing element. */
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return) /* Flags from cmpl are consumed by setl there. */
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax /* Difference of the zero-extended bytes is the result. */
+# endif
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..1ec1962e86
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -0,0 +1,17 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32 /* Vector width in bytes. */
+# define VEC(i) ymm##i /* VEC(n) names ymm<n>. */
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu
+# define VMOVA vmovdqa
+
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST /* Defined outside this file; NOTE(review): confirm its exact sequence. */
+
+# define VZEROUPPER_RETURN jmp L(return) /* Return sites branch to the shared L(return) label. */
+
+# define SECTION(p) p##.avx.rtm
+# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm /* Symbols get an _avx_ infix and _rtm suffix. */
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515cf6..7dad1ad74c 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,25 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 zmm16
+# define VEC1 zmm17
+# define VEC2 zmm18
+# define VEC3 zmm19
+# define VEC4 zmm20
+# define VEC5 zmm21
+# define VEC6 zmm22
+# define VEC7 zmm23
+# define VEC8 zmm24
+# define VEC(i) VEC##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 0000000000..b879007e89
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,26 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32 /* Vector width in bytes. */
+# define XMM0 xmm16 /* XMM0/XMM1 and YMM0/YMM1 name registers 16 and 17. */
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 ymm16 /* VEC0-VEC8 map onto ymm16-ymm24. */
+# define VEC1 ymm17
+# define VEC2 ymm18
+# define VEC3 ymm19
+# define VEC4 ymm20
+# define VEC5 ymm21
+# define VEC6 ymm22
+# define VEC7 ymm23
+# define VEC8 ymm24
+# define VEC(i) VEC##i /* VEC(n) pastes to VECn, one of the names above. */
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER /* Defined empty: this variant emits no vzeroupper. */
+
+# define SECTION(p) p##.evex
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s /* Symbols get an _evex_ infix. */
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 95e9bb0e5d..6f599eff16 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -150,11 +158,12 @@ L(last_2x_vec):
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
- VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
-#endif
ret
+#else
+ VZEROUPPER_RETURN
+#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
@@ -247,8 +256,11 @@ L(last_2x_vec):
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
- VZEROUPPER
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
ret
+#endif
L(movsb):
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
@@ -312,21 +324,20 @@ L(less_vec):
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
- VZEROUPPER
- ret
+ VMOVU (%rsi), %YMM0
+ VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU %YMM0, (%rdi)
+ VMOVU %YMM1, -32(%rdi,%rdx)
+ VZEROUPPER_RETURN
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
- ret
+ VMOVU (%rsi), %XMM0
+ VMOVU -16(%rsi,%rdx), %XMM1
+ VMOVU %XMM0, (%rdi)
+ VMOVU %XMM1, -16(%rdi,%rdx)
+ VZEROUPPER_RETURN
#endif
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
@@ -379,8 +390,7 @@ L(more_2x_vec):
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(last_4x_vec):
/* Copy from 2 * VEC to 4 * VEC. */
VMOVU (%rsi), %VEC(0)
@@ -391,8 +401,7 @@ L(last_4x_vec):
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(more_8x_vec):
cmpq %rsi, %rdi
@@ -448,8 +457,7 @@ L(loop_4x_vec_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
@@ -500,8 +508,7 @@ L(loop_4x_vec_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
@@ -536,8 +543,7 @@ L(loop_large_forward):
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
/* Store the first VEC. */
VMOVU %VEC(4), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(large_backward):
/* Don't use non-temporal store if there is overlap between
@@ -571,8 +577,7 @@ L(loop_large_backward):
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC. */
VMOVU %VEC(8), (%r11)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
new file mode 100644
index 0000000000..cea2d2a72d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2_rtm /* Entry symbol used by the included implementation. */
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST /* Defined outside this file; NOTE(review): confirm its exact sequence. */
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper) /* Return sites branch to one shared exit label. */
+
+#define SECTION(p) p##.avx.rtm /* SECTION(.text) becomes .text.avx.rtm. */
+
+#include "memrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index e19a732396..a9c33e47b3 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -20,14 +20,22 @@
# include <sysdep.h>
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2
+# endif
+
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
-ENTRY (__memrchr_avx2)
+ .section SECTION(.text),"ax",@progbits
+ENTRY (MEMRCHR)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpbroadcastb %xmm0, %ymm0
@@ -134,8 +142,8 @@ L(loop_4x_vec):
vpmovmskb %ymm1, %eax
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(last_4x_vec_or_less):
@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
addq %rax, %rdx
jl L(zero)
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_2x_vec):
@@ -191,31 +198,27 @@ L(last_2x_vec):
jl L(zero)
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x0):
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x1):
bsrl %eax, %eax
addl $VEC_SIZE, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x2):
bsrl %eax, %eax
addl $(VEC_SIZE * 2), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3):
@@ -232,8 +235,7 @@ L(last_vec_x1_check):
jl L(zero)
addl $VEC_SIZE, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_x3_check):
@@ -243,12 +245,14 @@ L(last_vec_x3_check):
jl L(zero)
addl $(VEC_SIZE * 3), %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(zero):
- VZEROUPPER
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+
+ .p2align 4
L(null):
xorl %eax, %eax
ret
@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
bsrl %eax, %eax
addq %rdi, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_or_less):
@@ -315,8 +318,7 @@ L(last_vec_or_less):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(last_vec_2x_aligned):
@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
bsrl %eax, %eax
addq %rdi, %rax
addq %r8, %rax
- VZEROUPPER
- ret
-END (__memrchr_avx2)
+ VZEROUPPER_RETURN
+END (MEMRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
new file mode 100644
index 0000000000..16bf8e02b1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -0,0 +1,337 @@
+/* memrchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define VMOVA vmovdqa64
+
+# define YMMMATCH ymm16
+
+# define VEC_SIZE 32
+
+ .section .text.evex,"ax",@progbits
+ENTRY (__memrchr_evex) /* rdi = buffer, esi = byte to find, rdx = length; result in rax. */
+ /* Broadcast CHAR to YMMMATCH. */
+ vpbroadcastb %esi, %YMMMATCH
+
+ sub $VEC_SIZE, %RDX_LP /* rdx = length - VEC_SIZE. */
+ jbe L(last_vec_or_less) /* Length <= VEC_SIZE. */
+
+ add %RDX_LP, %RDI_LP /* rdi = start of the last VEC_SIZE bytes. */
+
+ /* Check the last VEC_SIZE bytes. */
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ subq $(VEC_SIZE * 4), %rdi /* Step back over the next 4 * VEC_SIZE bytes. */
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx /* ecx = misalignment of rdi within a VEC. */
+ jz L(aligned_more)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rdx
+ andq $-VEC_SIZE, %rdi /* Net effect: rdi is rounded up to a VEC_SIZE boundary. */
+ subq %rcx, %rdx /* Net effect: rdx grows by the same amount as rdi. */
+
+ .p2align 4
+L(aligned_more):
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less) /* At most 4 * VEC_SIZE bytes lie below rdi + 4 * VEC_SIZE. */
+
+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3) /* Highest addresses are checked first. */
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+ kmovd %k2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+ kmovd %k3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpcmpb $0, (%rdi), %YMMMATCH, %k4
+ kmovd %k4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ /* Align data to 4 * VEC_SIZE for loop with fewer branches.
+ There are some overlaps with above if data isn't aligned
+ to 4 * VEC_SIZE. */
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx /* ecx = misalignment of rdi within 4 * VEC_SIZE. */
+ jz L(loop_4x_vec)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rdi /* Net effect: rdi is rounded up to a 4 * VEC_SIZE boundary. */
+ subq %rcx, %rdx /* Net effect: rdx grows by the same amount as rdi. */
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time, moving backward through the buffer. */
+ subq $(VEC_SIZE * 4), %rdi
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+ kord %k1, %k2, %k5
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+
+ kord %k3, %k4, %k6
+ kortestd %k5, %k6 /* ZF set only if none of the four masks has a bit set. */
+ jz L(loop_4x_vec)
+
+ /* There is a match. */
+ kmovd %k4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3) /* Highest-address VEC first: the last match wins. */
+
+ kmovd %k3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ kmovd %k2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ kmovd %k1, %eax
+ bsrl %eax, %eax /* Index of the last match in the VEC at rdi. */
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_4x_vec_or_less):
+ addl $(VEC_SIZE * 4), %edx /* Undo the 4 * VEC_SIZE subtraction done just before jumping here. */
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+ kmovd %k2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+ kmovd %k3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1_check) /* This VEC may start before the buffer: bounds-check the match. */
+ cmpl $(VEC_SIZE * 3), %edx
+ jbe L(zero) /* Nothing of the buffer lies in the VEC at rdi. */
+
+ vpcmpb $0, (%rdi), %YMMMATCH, %k4
+ kmovd %k4, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax /* Index of the last match in the VEC at rdi. */
+ subq $(VEC_SIZE * 4), %rdx
+ addq %rax, %rdx /* Negative if the match lies before the buffer start. */
+ jl L(zero)
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_2x_vec): /* At most 2 * VEC_SIZE bytes remain, ending at rdi + 4 * VEC_SIZE. */
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3_check) /* This VEC may start before the buffer: bounds-check the match. */
+ cmpl $VEC_SIZE, %edx
+ jbe L(zero) /* Nothing of the buffer lies below that VEC. */
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax /* Index of the last match within this VEC. */
+ subq $(VEC_SIZE * 2), %rdx
+ addq %rax, %rdx /* Negative if the match lies before the buffer start. */
+ jl L(zero)
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x0): /* eax = match mask for the VEC at rdi. */
+ bsrl %eax, %eax /* Index of the last match within the VEC. */
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x1): /* eax = match mask for the VEC at rdi + VEC_SIZE. */
+ bsrl %eax, %eax
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x2): /* eax = match mask for the VEC at rdi + 2 * VEC_SIZE. */
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x3): /* eax = match mask for the VEC at rdi + 3 * VEC_SIZE. */
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x1_check): /* Like L(last_vec_x1), but rejects a match before the buffer start. */
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdx
+ addq %rax, %rdx /* Negative if the match lies before the buffer start. */
+ jl L(zero)
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x3_check): /* Like L(last_vec_x3), but rejects a match before the buffer start. */
+ bsrl %eax, %eax
+ subq $VEC_SIZE, %rdx
+ addq %rax, %rdx /* Negative if the match lies before the buffer start. */
+ jl L(zero)
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax /* No match: return NULL. */
+ ret
+
+ .p2align 4
+L(last_vec_or_less_aligned): /* rdi is VEC_SIZE-aligned; edx = length (1 to VEC_SIZE). */
+ movl %edx, %ecx
+
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
+
+ movl $1, %edx
+ /* Support rdx << 32. */
+ salq %cl, %rdx
+ subq $1, %rdx /* rdx = (1 << length) - 1: one bit per byte inside the buffer. */
+
+ kmovd %k1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax /* Index of the last match inside the buffer. */
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_or_less):
+ addl $VEC_SIZE, %edx /* Undo the entry subtraction: edx = length. */
+
+ /* Check for zero length. */
+ testl %edx, %edx
+ jz L(zero)
+
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx /* ecx = misalignment of the buffer start. */
+ jz L(last_vec_or_less_aligned)
+
+ movl %ecx, %esi
+ movl %ecx, %r8d /* Keep the misalignment for the final address. */
+ addl %edx, %esi
+ andq $-VEC_SIZE, %rdi /* Round rdi down to a VEC_SIZE boundary. */
+
+ subl $VEC_SIZE, %esi /* esi = bytes of the buffer past the first aligned VEC. */
+ ja L(last_vec_2x_aligned)
+
+ /* Check the last VEC. */
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ kmovd %k1, %eax
+
+ /* Remove the leading and trailing bytes. */
+ sarl %cl, %eax /* Drop bits for bytes before the buffer start. */
+ movl %edx, %ecx
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx /* edx = (1 << length) - 1. */
+
+ andl %edx, %eax /* Also clears any sign bits shifted in by sarl. */
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax /* Add back the misalignment removed by the shift. */
+ ret
+
+ .p2align 4
+L(last_vec_2x_aligned): /* The buffer straddles the two aligned VECs at rdi and rdi + VEC_SIZE. */
+ movl %esi, %ecx /* ecx = bytes of the buffer inside the second VEC. */
+
+ /* Check the last VEC. */
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx /* edx = (1 << ecx) - 1. */
+
+ kmovd %k1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ /* Check the second last VEC. */
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
+
+ movl %r8d, %ecx /* ecx = misalignment of the buffer start. */
+
+ kmovd %k1, %eax
+
+ /* Remove the leading bytes. Must use unsigned right shift for
+ bsrl below. */
+ shrl %cl, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax /* Add back the misalignment removed by the shift. */
+ ret
+END (__memrchr_evex)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
new file mode 100644
index 0000000000..8ac3e479bb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -0,0 +1,10 @@
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST /* Defined outside this file; NOTE(review): confirm its exact sequence. */
+
+#define VZEROUPPER_RETURN jmp L(return) /* Return sites branch to the shared L(return) label. */
+
+#define SECTION(p) p##.avx.rtm
+#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm /* Symbols get an _avx2_ infix and _rtm suffix. */
+#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
+
+#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 7ab3d89849..ae0860f36a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,15 @@
movq r, %rax; \
vpbroadcastd %xmm0, %ymm0
-# define SECTION(p) p##.avx
-# define MEMSET_SYMBOL(p,s) p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+# ifndef MEMSET_SYMBOL
+# define MEMSET_SYMBOL(p,s) p##_avx2_##s
+# endif
+# ifndef WMEMSET_SYMBOL
+# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
+# endif
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979ca5..22e7b187c8 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 zmm16
+# define VEC(i) VEC##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastb d, %VEC0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastd d, %VEC0
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 0000000000..ae0a4d6e46
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32 /* Vector width in bytes. */
+# define XMM0 xmm16 /* XMM0, YMM0 and VEC0 all name register 16 at different widths. */
+# define YMM0 ymm16
+# define VEC0 ymm16
+# define VEC(i) VEC##i /* Only VEC0 is defined here, so only VEC(0) is usable. */
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER /* Defined empty: this variant emits no vzeroupper. */
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastb d, %VEC0 /* rax = r; VEC0 = d replicated per byte. */
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastd d, %VEC0 /* rax = r; VEC0 = d replicated per 32-bit element. */
+
+# define SECTION(p) p##.evex
+# define MEMSET_SYMBOL(p,s) p##_evex_##s /* Symbols get an _evex_ infix. */
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 5e0d307d85..375844fff4 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,20 +34,25 @@
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
+# define VZEROUPPER_SHORT_RETURN vzeroupper; ret
# else
# define VZEROUPPER
# endif
#endif
#ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-# define VZEROUPPER_SHORT_RETURN vzeroupper
-# else
-# define VZEROUPPER_SHORT_RETURN rep
-# endif
+# define VZEROUPPER_SHORT_RETURN rep; ret
#endif
#ifndef MOVQ
@@ -77,7 +82,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
- pxor %xmm0, %xmm0
+ pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
@@ -119,8 +124,7 @@ L(entry_from_bzero):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
@@ -143,14 +147,12 @@ ENTRY (__memset_erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
- /* Issue vzeroupper before rep stosb. */
- VZEROUPPER
mov %RDX_LP, %RCX_LP
movzbl %sil, %eax
mov %RDI_LP, %RDX_LP
rep stosb
mov %RDX_LP, %RAX_LP
- ret
+ VZEROUPPER_RETURN
# if VEC_SIZE == 16
END (__memset_erms)
# else
@@ -177,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(stosb_more_2x_vec):
cmpq $REP_STOSB_THRESHOLD, %rdx
@@ -192,8 +193,11 @@ L(more_2x_vec):
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
- VZEROUPPER
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
ret
+#endif
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
@@ -219,7 +223,6 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
VZEROUPPER_SHORT_RETURN
- ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -233,7 +236,7 @@ L(less_vec):
cmpb $16, %dl
jae L(between_16_31)
# endif
- MOVQ %xmm0, %rcx
+ MOVQ %XMM0, %rcx
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -243,40 +246,34 @@ L(less_vec):
jb 1f
movb %cl, (%rdi)
1:
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
- VZEROUPPER
- ret
+ VMOVU %YMM0, -32(%rdi,%rdx)
+ VMOVU %YMM0, (%rdi)
+ VZEROUPPER_RETURN
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
- VZEROUPPER
- ret
+ VMOVU %XMM0, -16(%rdi,%rdx)
+ VMOVU %XMM0, (%rdi)
+ VZEROUPPER_RETURN
# endif
/* From 8 to 15. No branch when size == 8. */
L(between_8_15):
movq %rcx, -8(%rdi,%rdx)
movq %rcx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
movl %ecx, -4(%rdi,%rdx)
movl %ecx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
movw %cx, -2(%rdi,%rdx)
movw %cx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..acc5f6e2fb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2_rtm
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
new file mode 100644
index 0000000000..ec942b77ba
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_evex
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
new file mode 100644
index 0000000000..2b9c07a59f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
new file mode 100644
index 0000000000..7c6f26cd98
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
new file mode 100644
index 0000000000..60a2ccfe53
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
new file mode 100644
index 0000000000..1570014d1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
new file mode 100644
index 0000000000..637fb557c4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCAT
+# define STRCAT __strcat_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index b062356427..aa48c058b9 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -30,7 +30,11 @@
/* Number of bytes in a vector register */
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
mov %rdi, %r9
# ifdef USE_AS_STRNCAT
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
new file mode 100644
index 0000000000..97c3d85b6d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -0,0 +1,283 @@
+/* strcat with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+# define STRCAT __strcat_evex
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+/* zero register */
+# define XMMZERO xmm16
+# define YMMZERO ymm16
+# define YMM0 ymm17
+# define YMM1 ymm18
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE 32
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRCAT)
+ mov %rdi, %r9
+# ifdef USE_AS_STRNCAT
+ mov %rdx, %r8
+# endif
+
+ xor %eax, %eax
+ mov %edi, %ecx
+ and $((VEC_SIZE * 4) - 1), %ecx
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+ cmp $(VEC_SIZE * 3), %ecx
+ ja L(fourth_vector_boundary)
+ vpcmpb $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_first_vector)
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ jmp L(align_vec_size_start)
+L(fourth_vector_boundary):
+ mov %rdi, %rax
+ and $-VEC_SIZE, %rax
+ vpcmpb $0, (%rax), %YMMZERO, %k0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ kmovd %k0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+
+L(align_vec_size_start):
+ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ kmovd %k0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ kmovd %k2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ kmovd %k3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ add $(VEC_SIZE * 4), %rax
+ kmovd %k4, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ kmovd %k2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ kmovd %k3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ kmovd %k4, %edx
+ add $(VEC_SIZE * 4), %rax
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ kmovd %k2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ kmovd %k3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ add $(VEC_SIZE * 4), %rax
+ kmovd %k4, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ kmovd %k2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ kmovd %k3, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fifth_vector)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ add $(VEC_SIZE * 5), %rax
+ kmovd %k4, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ add $VEC_SIZE, %rax
+ kmovd %k0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ add $VEC_SIZE, %rax
+ kmovd %k0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $((VEC_SIZE * 4) - 1), %rax
+ jz L(align_four_vec_loop)
+
+ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
+ add $VEC_SIZE, %rax
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $VEC_SIZE, %rax
+
+ .p2align 4
+L(align_four_vec_loop):
+ VMOVA (%rax), %YMM0
+ VMOVA (VEC_SIZE * 2)(%rax), %YMM1
+ vpminub VEC_SIZE(%rax), %YMM0, %YMM0
+ vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
+ vpminub %YMM0, %YMM1, %YMM0
+ /* If K0 != 0, there is a null byte. */
+ vpcmpb $0, %YMM0, %YMMZERO, %k0
+ add $(VEC_SIZE * 4), %rax
+ ktestd %k0, %k0
+ jz L(align_four_vec_loop)
+
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+ sub $(VEC_SIZE * 5), %rax
+ kmovd %k0, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_second_vector)
+
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_third_vector)
+
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ kmovd %k2, %edx
+ test %edx, %edx
+ jnz L(exit_null_on_fourth_vector)
+
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ kmovd %k3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_null_on_first_vector):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_second_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $VEC_SIZE, %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_third_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 2), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fourth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 3), %rax
+ jmp L(StartStrcpyPart)
+
+ .p2align 4
+L(exit_null_on_fifth_vector):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $(VEC_SIZE * 4), %rax
+
+ .p2align 4
+L(StartStrcpyPart):
+ lea (%r9, %rax), %rdi
+ mov %rsi, %rcx
+ mov %r9, %rax /* save result */
+
+# ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero)
+# define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-evex.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
new file mode 100644
index 0000000000..81f20d1d8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCHR
+# define STRCHR __strchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 71b1a1d06e..352074eacb 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -38,9 +38,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
/* Broadcast CHAR to YMM0. */
@@ -93,8 +97,8 @@ L(cros_page_boundary):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(aligned_more):
@@ -190,8 +194,7 @@ L(first_vec_x0):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x1):
@@ -205,8 +208,7 @@ L(first_vec_x1):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(first_vec_x2):
@@ -220,8 +222,7 @@ L(first_vec_x2):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(4x_vec_end):
@@ -247,8 +248,7 @@ L(first_vec_x3):
cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
new file mode 100644
index 0000000000..ddc86a7058
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -0,0 +1,335 @@
+/* strchr/strchrnul optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_evex
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define CHAR_REG esi
+# define SHIFT_REG r8d
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define CHAR_REG sil
+# define SHIFT_REG ecx
+# endif
+
+# define XMMZERO xmm16
+
+# define YMMZERO ymm16
+# define YMM0 ymm17
+# define YMM1 ymm18
+# define YMM2 ymm19
+# define YMM3 ymm20
+# define YMM4 ymm21
+# define YMM5 ymm22
+# define YMM6 ymm23
+# define YMM7 ymm24
+# define YMM8 ymm25
+
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
+ VPBROADCAST %esi, %YMM0
+
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null bytes. */
+ VMOVU (%rdi), %YMM1
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ ktestd %k0, %k0
+ jz L(more_vecs)
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (%rdi, %rax, 4), %rax
+# else
+ addq %rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ .p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
+L(aligned_more):
+
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VMOVA VEC_SIZE(%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VMOVA VEC_SIZE(%rdi), %YMM1
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ ktestd %k0, %k0
+ jz L(prep_loop_4x)
+
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+# else
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (%rdi, %rax, 4), %rax
+# else
+ addq %rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq VEC_SIZE(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Found CHAR or the null byte. */
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+# else
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
+ VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM5
+ vpxorq %YMM2, %YMM0, %YMM6
+ vpxorq %YMM3, %YMM0, %YMM7
+ vpxorq %YMM4, %YMM0, %YMM8
+
+ VPMINU %YMM5, %YMM1, %YMM5
+ VPMINU %YMM6, %YMM2, %YMM6
+ VPMINU %YMM7, %YMM3, %YMM7
+ VPMINU %YMM8, %YMM4, %YMM8
+
+ VPMINU %YMM5, %YMM6, %YMM1
+ VPMINU %YMM7, %YMM8, %YMM2
+
+ VPMINU %YMM1, %YMM2, %YMM1
+
+ /* Each bit in K0 represents a CHAR or a null byte. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ ktestd %k0, %k0
+ jz L(loop_4x_vec)
+
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM5, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
+ VPCMP $0, %YMMZERO, %YMM6, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
+ VPCMP $0, %YMMZERO, %YMM7, %k2
+ /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
+ VPCMP $0, %YMMZERO, %YMM8, %k3
+
+# ifdef USE_AS_WCSCHR
+ /* NB: Each bit in K2/K3 represents 4-byte element. */
+ kshiftlw $8, %k3, %k1
+# else
+ kshiftlq $32, %k3, %k1
+# endif
+
+ /* Each bit in K1 represents a CHAR or a null byte in YMM3/YMM4. */
+ korq %k1, %k2, %k1
+ kmovq %k1, %rax
+
+ tzcntq %rax, %rax
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+# else
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ VMOVA (%rdi), %YMM1
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+
+# ifdef USE_AS_WCSCHR
+ /* NB: Divide shift count by 4 since each bit in K0 represents 4
+ bytes. */
+ movl %ecx, %SHIFT_REG
+ sarl $2, %SHIFT_REG
+# endif
+
+ /* Remove the leading bits. */
+ sarxl %SHIFT_REG, %eax, %eax
+ testl %eax, %eax
+
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rcx, %rdi
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq (%rdi, %rax, 4), %rax
+# else
+ addq %rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ ret
+
+END (STRCHR)
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 5bc6eb3339..bcecec99e5 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -29,16 +29,28 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
+ && CPU_FEATURES_CPU_P (cpu_features, BMI2))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
return OPTIMIZE (sse2_no_bsf);
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
new file mode 100644
index 0000000000..cdcf818b91
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2_rtm
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
new file mode 100644
index 0000000000..064fe7ca9e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_evex
+#define USE_AS_STRCHRNUL 1
+#include "strchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
new file mode 100644
index 0000000000..aecd30d97f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCMP
+# define STRCMP __strcmp_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index c9f246415f..f199c91f7f 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -55,6 +55,10 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -75,7 +79,7 @@
the maximum offset is reached before a difference is found, zero is
returned. */
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
/* Check for simple cases (0 or 1) in offset. */
@@ -83,6 +87,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
+ overflow cases as well as redirect cases where it's impossible for
+ length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_avx2
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
@@ -127,8 +141,8 @@ L(return):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(return_vec_size):
@@ -161,8 +175,7 @@ L(return_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_2_vec_size):
@@ -195,8 +208,7 @@ L(return_2_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_3_vec_size):
@@ -229,8 +241,7 @@ L(return_3_vec_size):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(next_3_vectors):
@@ -356,8 +367,7 @@ L(back_to_loop):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_vec):
@@ -400,8 +410,7 @@ L(test_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_2_vec):
@@ -444,8 +453,7 @@ L(test_2_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(test_3_vec):
@@ -486,8 +494,7 @@ L(test_3_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page):
@@ -556,8 +563,7 @@ L(loop_cross_page):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(loop_cross_page_2_vec):
@@ -631,8 +637,7 @@ L(loop_cross_page_2_vec):
subl %edx, %eax
# endif
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
@@ -674,8 +679,7 @@ L(cross_page_loop):
# ifndef USE_AS_WCSCMP
L(different):
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_WCSCMP
.p2align 4
@@ -685,16 +689,14 @@ L(different):
setl %al
negl %eax
orl $1, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
# ifdef USE_AS_STRNCMP
.p2align 4
L(zero):
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(char0):
@@ -708,8 +710,7 @@ L(char0):
movzbl (%rdi), %eax
subl %ecx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
.p2align 4
@@ -734,8 +735,7 @@ L(last_vector):
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
/* Comparing on page boundary region requires special treatment:
It must done one vector at the time, starting with the wider
@@ -856,7 +856,6 @@ L(cross_page_4bytes):
testl %eax, %eax
jne L(cross_page_loop)
subl %ecx, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRCMP)
#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
new file mode 100644
index 0000000000..459eeed09f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -0,0 +1,1043 @@
+/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCMP
+# define STRCMP __strcmp_evex
+# endif
+
+# define PAGE_SIZE 4096
+
+/* VEC_SIZE = Number of bytes in a ymm register */
+# define VEC_SIZE 32
+
+/* Shift for dividing by (VEC_SIZE * 4). */
+# define DIVIDE_BY_VEC_4_SHIFT 7
+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# ifdef USE_AS_WCSCMP
+/* Compare packed dwords. */
+# define VPCMP vpcmpd
+# define SHIFT_REG32 r8d
+# define SHIFT_REG64 r8
+/* 1 dword char == 4 bytes. */
+# define SIZE_OF_CHAR 4
+# else
+/* Compare packed bytes. */
+# define VPCMP vpcmpb
+# define SHIFT_REG32 ecx
+# define SHIFT_REG64 rcx
+/* 1 byte char == 1 byte. */
+# define SIZE_OF_CHAR 1
+# endif
+
+# define XMMZERO xmm16
+# define XMM0 xmm17
+# define XMM1 xmm18
+
+# define YMMZERO ymm16
+# define YMM0 ymm17
+# define YMM1 ymm18
+# define YMM2 ymm19
+# define YMM3 ymm20
+# define YMM4 ymm21
+# define YMM5 ymm22
+# define YMM6 ymm23
+# define YMM7 ymm24
+
+/* Warning!
+ wcscmp/wcsncmp have to use SIGNED comparison for elements.
+ strcmp/strncmp have to use UNSIGNED comparison for elements.
+*/
+
+/* The main idea of the string comparison (byte or dword) using 256-bit
+ EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+ latter can be on either packed bytes or dwords depending on
+ USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
+ matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+ KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+ are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+ instructions. Main loop (away from the page boundary) compares 4
+ vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
+ bytes) on each loop.
+
+ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
+ is the same as strcmp, except that a maximum offset is tracked. If
+ the maximum offset is reached before a difference is found, zero is
+ returned. */
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRCMP)
+# ifdef USE_AS_STRNCMP
+ /* Check for simple cases (0 or 1) in offset. */
+ cmp $1, %RDX_LP
+ je L(char0)
+ jb L(zero)
+# ifdef USE_AS_WCSCMP
+ /* Convert units: from wide to byte char. */
+ shl $2, %RDX_LP
+# endif
+ /* Register %r11 tracks the maximum offset. */
+ mov %RDX_LP, %R11_LP
+# endif
+ movl %edi, %eax
+ xorl %edx, %edx
+ /* Make %XMMZERO (%YMMZERO) all zeros in this function. */
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+ jg L(cross_page)
+ /* Start comparing 4 vectors. */
+ VMOVU (%rdi), %YMM0
+ VMOVU (%rsi), %YMM1
+
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
+ VPCMP $4, %YMM0, %YMM1, %k0
+
+ /* Check for NULL in YMM0. */
+ VPCMP $0, %YMMZERO, %YMM0, %k1
+ /* Check for NULL in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k2
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
+ kord %k1, %k2, %k1
+
+ /* Each bit in K1 represents:
+ 1. A mismatch in YMM0 and YMM1. Or
+ 2. A NULL in YMM0 or YMM1.
+ */
+ kord %k0, %k1, %k1
+
+ ktestd %k1, %k1
+ je L(next_3_vectors)
+ kmovd %k1, %ecx
+ tzcntl %ecx, %edx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edx
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx) is after the maximum
+ offset (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ je L(return)
+L(wcscmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+L(return):
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(return_vec_size):
+ kmovd %k1, %ecx
+ tzcntl %ecx, %edx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edx
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+ the maximum offset (%r11). */
+ addq $VEC_SIZE, %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rdx), %ecx
+ cmpl VEC_SIZE(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rdx), %eax
+ movzbl VEC_SIZE(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(return_2_vec_size):
+ kmovd %k1, %ecx
+ tzcntl %ecx, %edx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edx
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+ after the maximum offset (%r11). */
+ addq $(VEC_SIZE * 2), %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
+ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(return_3_vec_size):
+ kmovd %k1, %ecx
+ tzcntl %ecx, %edx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edx
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+ after the maximum offset (%r11). */
+ addq $(VEC_SIZE * 3), %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
+ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(next_3_vectors):
+ VMOVU VEC_SIZE(%rdi), %YMM0
+ VMOVU VEC_SIZE(%rsi), %YMM1
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
+ VPCMP $4, %YMM0, %YMM1, %k0
+ VPCMP $0, %YMMZERO, %YMM0, %k1
+ VPCMP $0, %YMMZERO, %YMM1, %k2
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ ktestd %k1, %k1
+ jne L(return_vec_size)
+
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
+
+ /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
+ VPCMP $4, %YMM2, %YMM4, %k0
+ VPCMP $0, %YMMZERO, %YMM2, %k1
+ VPCMP $0, %YMMZERO, %YMM4, %k2
+ /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ ktestd %k1, %k1
+ jne L(return_2_vec_size)
+
+ /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
+ VPCMP $4, %YMM3, %YMM5, %k0
+ VPCMP $0, %YMMZERO, %YMM3, %k1
+ VPCMP $0, %YMMZERO, %YMM5, %k2
+ /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ ktestd %k1, %k1
+ jne L(return_3_vec_size)
+L(main_loop_header):
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ movl $PAGE_SIZE, %ecx
+ /* Align load via RAX. */
+ andq $-(VEC_SIZE * 4), %rdx
+ subq %rdi, %rdx
+ leaq (%rdi, %rdx), %rax
+# ifdef USE_AS_STRNCMP
+ /* Starting from this point, the maximum offset, or simply the
+ 'offset', DECREASES by the same amount when base pointers are
+ moved forward. Return 0 when:
+ 1) On match: offset <= the matched vector index.
+ 2) On mismatch, offset is before the mismatched index.
+ */
+ subq %rdx, %r11
+ jbe L(zero)
+# endif
+ addq %rsi, %rdx
+ movq %rdx, %rsi
+ andl $(PAGE_SIZE - 1), %esi
+ /* Number of bytes before page crossing. */
+ subq %rsi, %rcx
+ /* Number of VEC_SIZE * 4 blocks before page crossing. */
+ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
+ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
+ movl %ecx, %esi
+ jmp L(loop_start)
+
+ .p2align 4
+L(loop):
+# ifdef USE_AS_STRNCMP
+ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
+ the maximum offset (%r11) by the same amount. */
+ subq $(VEC_SIZE * 4), %r11
+ jbe L(zero)
+# endif
+ addq $(VEC_SIZE * 4), %rax
+ addq $(VEC_SIZE * 4), %rdx
+L(loop_start):
+ testl %esi, %esi
+ leal -1(%esi), %esi
+ je L(loop_cross_page)
+L(back_to_loop):
+ /* Main loop, comparing 4 vectors at a time. */
+ VMOVA (%rax), %YMM0
+ VMOVA VEC_SIZE(%rax), %YMM2
+ VMOVA (VEC_SIZE * 2)(%rax), %YMM4
+ VMOVA (VEC_SIZE * 3)(%rax), %YMM6
+ VMOVU (%rdx), %YMM1
+ VMOVU VEC_SIZE(%rdx), %YMM3
+ VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
+ VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
+
+ VPCMP $4, %YMM0, %YMM1, %k0
+ VPCMP $0, %YMMZERO, %YMM0, %k1
+ VPCMP $0, %YMMZERO, %YMM1, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
+ YMM1. */
+ kord %k0, %k1, %k4
+
+ VPCMP $4, %YMM2, %YMM3, %k0
+ VPCMP $0, %YMMZERO, %YMM2, %k1
+ VPCMP $0, %YMMZERO, %YMM3, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
+ YMM3. */
+ kord %k0, %k1, %k5
+
+ VPCMP $4, %YMM4, %YMM5, %k0
+ VPCMP $0, %YMMZERO, %YMM4, %k1
+ VPCMP $0, %YMMZERO, %YMM5, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
+ YMM5. */
+ kord %k0, %k1, %k6
+
+ VPCMP $4, %YMM6, %YMM7, %k0
+ VPCMP $0, %YMMZERO, %YMM6, %k1
+ VPCMP $0, %YMMZERO, %YMM7, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
+ YMM7. */
+ kord %k0, %k1, %k7
+
+ kord %k4, %k5, %k0
+ kord %k6, %k7, %k1
+
+ /* Test each mask (32 bits) individually because for VEC_SIZE
+ == 32 it is not possible to OR the four masks and keep all bits
+ in a 64-bit integer register, differing from SSE2 strcmp
+ where ORing is possible. */
+ kortestd %k0, %k1
+ je L(loop)
+ ktestd %k4, %k4
+ je L(test_vec)
+ kmovd %k4, %edi
+ tzcntl %edi, %ecx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %ecx
+# endif
+# ifdef USE_AS_STRNCMP
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(test_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first vector matched. Return 0 if the maximum offset
+ (%r11) <= VEC_SIZE. */
+ cmpq $VEC_SIZE, %r11
+ jbe L(zero)
+# endif
+ ktestd %k5, %k5
+ je L(test_2_vec)
+ kmovd %k5, %ecx
+ tzcntl %ecx, %edi
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edi
+# endif
+# ifdef USE_AS_STRNCMP
+ addq $VEC_SIZE, %rdi
+ cmpq %rdi, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rdi), %ecx
+ cmpl (%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rdi), %eax
+ movzbl (%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl VEC_SIZE(%rsi, %rdi), %ecx
+ cmpl VEC_SIZE(%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl VEC_SIZE(%rax, %rdi), %eax
+ movzbl VEC_SIZE(%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(test_2_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first 2 vectors matched. Return 0 if the maximum offset
+ (%r11) <= 2 * VEC_SIZE. */
+ cmpq $(VEC_SIZE * 2), %r11
+ jbe L(zero)
+# endif
+ ktestd %k6, %k6
+ je L(test_3_vec)
+ kmovd %k6, %ecx
+ tzcntl %ecx, %edi
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edi
+# endif
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 2), %rdi
+ cmpq %rdi, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rdi), %ecx
+ cmpl (%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rdi), %eax
+ movzbl (%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
+ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
+ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(test_3_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first 3 vectors matched. Return 0 if the maximum offset
+ (%r11) <= 3 * VEC_SIZE. */
+ cmpq $(VEC_SIZE * 3), %r11
+ jbe L(zero)
+# endif
+ kmovd %k7, %esi
+ tzcntl %esi, %ecx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %ecx
+# endif
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 3), %rcx
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %esi
+ cmpl (%rdx, %rcx), %esi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
+ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(loop_cross_page):
+ xorl %r10d, %r10d
+ movq %rdx, %rcx
+ /* Align load via RDX. We load the extra ECX bytes which should
+ be ignored. */
+ andl $((VEC_SIZE * 4) - 1), %ecx
+ /* R10 is -RCX. */
+ subq %rcx, %r10
+
+ /* This works only if VEC_SIZE * 2 == 64. */
+# if (VEC_SIZE * 2) != 64
+# error (VEC_SIZE * 2) != 64
+# endif
+
+ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
+ cmpl $(VEC_SIZE * 2), %ecx
+ jge L(loop_cross_page_2_vec)
+
+ VMOVU (%rax, %r10), %YMM2
+ VMOVU VEC_SIZE(%rax, %r10), %YMM3
+ VMOVU (%rdx, %r10), %YMM4
+ VMOVU VEC_SIZE(%rdx, %r10), %YMM5
+
+ VPCMP $4, %YMM4, %YMM2, %k0
+ VPCMP $0, %YMMZERO, %YMM2, %k1
+ VPCMP $0, %YMMZERO, %YMM4, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
+ YMM4. */
+ kord %k0, %k1, %k1
+
+ VPCMP $4, %YMM5, %YMM3, %k3
+ VPCMP $0, %YMMZERO, %YMM3, %k4
+ VPCMP $0, %YMMZERO, %YMM5, %k5
+ kord %k4, %k5, %k4
+ /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
+ YMM5. */
+ kord %k3, %k4, %k3
+
+# ifdef USE_AS_WCSCMP
+ /* NB: Each bit in K1/K3 represents 4-byte element. */
+ kshiftlw $8, %k3, %k2
+ /* NB: Divide shift count by 4 since each bit in K1 represents 4
+ bytes. */
+ movl %ecx, %SHIFT_REG32
+ sarl $2, %SHIFT_REG32
+# else
+ kshiftlq $32, %k3, %k2
+# endif
+
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ korq %k1, %k2, %k1
+ kmovq %k1, %rdi
+
+ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
+ shrxq %SHIFT_REG64, %rdi, %rdi
+ testq %rdi, %rdi
+ je L(loop_cross_page_2_vec)
+ tzcntq %rdi, %rcx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %ecx
+# endif
+# ifdef USE_AS_STRNCMP
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(loop_cross_page_2_vec):
+ /* The first VEC_SIZE * 2 bytes match or are ignored. */
+ VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
+ VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
+ VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
+ VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
+
+ VPCMP $4, %YMM0, %YMM2, %k0
+ VPCMP $0, %YMMZERO, %YMM0, %k1
+ VPCMP $0, %YMMZERO, %YMM2, %k2
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
+ YMM2. */
+ kord %k0, %k1, %k1
+
+ VPCMP $4, %YMM1, %YMM3, %k3
+ VPCMP $0, %YMMZERO, %YMM1, %k4
+ VPCMP $0, %YMMZERO, %YMM3, %k5
+ kord %k4, %k5, %k4
+ /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
+ YMM3. */
+ kord %k3, %k4, %k3
+
+# ifdef USE_AS_WCSCMP
+ /* NB: Each bit in K1/K3 represents 4-byte element. */
+ kshiftlw $8, %k3, %k2
+# else
+ kshiftlq $32, %k3, %k2
+# endif
+
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ korq %k1, %k2, %k1
+ kmovq %k1, %rdi
+
+ xorl %r8d, %r8d
+ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
+ subl $(VEC_SIZE * 2), %ecx
+ jle 1f
+ /* R8 has number of bytes skipped. */
+ movl %ecx, %r8d
+# ifdef USE_AS_WCSCMP
+ /* NB: Divide shift count by 4 since each bit in K1 represents 4
+ bytes. */
+ sarl $2, %ecx
+# endif
+ /* Skip ECX bytes. */
+ shrq %cl, %rdi
+1:
+ /* Before jumping back to the loop, set ESI to the number of
+ VEC_SIZE * 4 blocks before page crossing. */
+ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+
+ testq %rdi, %rdi
+# ifdef USE_AS_STRNCMP
+ /* At this point, if the value in %rdi is 0, VEC_SIZE*4+%r10
+ bytes starting from %rax have already been tested. The label
+ below checks whether the strncmp maximum offset was reached. */
+ je L(string_nbyte_offset_check)
+# else
+ je L(back_to_loop)
+# endif
+ tzcntq %rdi, %rcx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %ecx
+# endif
+ addq %r10, %rcx
+ /* Adjust for number of bytes skipped. */
+ addq %r8, %rcx
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 2), %rcx
+ subq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
+ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCMP
+L(string_nbyte_offset_check):
+ leaq (VEC_SIZE * 4)(%r10), %r10
+ cmpq %r10, %r11
+ jbe L(zero)
+ jmp L(back_to_loop)
+# endif
+
+ .p2align 4
+L(cross_page_loop):
+ /* Check one byte/dword at a time. */
+# ifdef USE_AS_WCSCMP
+ cmpl %ecx, %eax
+# else
+ subl %ecx, %eax
+# endif
+ jne L(different)
+ addl $SIZE_OF_CHAR, %edx
+ cmpl $(VEC_SIZE * 4), %edx
+ je L(main_loop_header)
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rdx), %eax
+ movl (%rsi, %rdx), %ecx
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+# endif
+ /* Check null char. */
+ testl %eax, %eax
+ jne L(cross_page_loop)
+ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+ comparisons. */
+ subl %ecx, %eax
+# ifndef USE_AS_WCSCMP
+L(different):
+# endif
+ ret
+
+# ifdef USE_AS_WCSCMP
+ .p2align 4
+L(different):
+ /* Use movl to avoid modifying EFLAGS. */
+ movl $0, %eax
+ setl %al
+ negl %eax
+ orl $1, %eax
+ ret
+# endif
+
+# ifdef USE_AS_STRNCMP
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(char0):
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi), %ecx
+ cmpl (%rsi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+ subl %ecx, %eax
+# endif
+ ret
+# endif
+
+ .p2align 4
+L(last_vector):
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+# ifdef USE_AS_STRNCMP
+ subq %rdx, %r11
+# endif
+ tzcntl %ecx, %edx
+# ifdef USE_AS_WCSCMP
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %edx
+# endif
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+ ret
+
+ /* Comparing on page boundary region requires special treatment:
+ It must be done one vector at a time, starting with the wider
+ ymm vector if possible, if not, with xmm. If fetching 16 bytes
+ (xmm) still passes the boundary, byte comparison must be done.
+ */
+ .p2align 4
+L(cross_page):
+ /* Try one ymm vector at a time. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_1_vector)
+L(loop_1_vector):
+ VMOVU (%rdi, %rdx), %YMM0
+ VMOVU (%rsi, %rdx), %YMM1
+
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
+ VPCMP $4, %YMM0, %YMM1, %k0
+ VPCMP $0, %YMMZERO, %YMM0, %k1
+ VPCMP $0, %YMMZERO, %YMM1, %k2
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ kmovd %k1, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $VEC_SIZE, %edx
+
+ addl $VEC_SIZE, %eax
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jle L(loop_1_vector)
+L(cross_page_1_vector):
+ /* Less than 32 bytes to check, try one xmm vector. */
+ cmpl $(PAGE_SIZE - 16), %eax
+ jg L(cross_page_1_xmm)
+ VMOVU (%rdi, %rdx), %XMM0
+ VMOVU (%rsi, %rdx), %XMM1
+
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
+ VPCMP $4, %XMM0, %XMM1, %k0
+ VPCMP $0, %XMMZERO, %XMM0, %k1
+ VPCMP $0, %XMMZERO, %XMM1, %k2
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
+ korw %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ korw %k0, %k1, %k1
+ kmovw %k1, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $16, %edx
+# ifndef USE_AS_WCSCMP
+ addl $16, %eax
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_1_xmm):
+# ifndef USE_AS_WCSCMP
+ /* Less than 16 bytes to check, try 8 byte vector. NB: No need
+ for wcscmp nor wcsncmp since wide char is 4 bytes. */
+ cmpl $(PAGE_SIZE - 8), %eax
+ jg L(cross_page_8bytes)
+ vmovq (%rdi, %rdx), %XMM0
+ vmovq (%rsi, %rdx), %XMM1
+
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
+ VPCMP $4, %XMM0, %XMM1, %k0
+ VPCMP $0, %XMMZERO, %XMM0, %k1
+ VPCMP $0, %XMMZERO, %XMM1, %k2
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ kmovd %k1, %ecx
+
+# ifdef USE_AS_WCSCMP
+ /* Only last 2 bits are valid. */
+ andl $0x3, %ecx
+# else
+ /* Only last 8 bits are valid. */
+ andl $0xff, %ecx
+# endif
+
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $8, %edx
+ addl $8, %eax
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_8bytes):
+ /* Less than 8 bytes to check, try 4 byte vector. */
+ cmpl $(PAGE_SIZE - 4), %eax
+ jg L(cross_page_4bytes)
+ vmovd (%rdi, %rdx), %XMM0
+ vmovd (%rsi, %rdx), %XMM1
+
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
+ VPCMP $4, %XMM0, %XMM1, %k0
+ VPCMP $0, %XMMZERO, %XMM0, %k1
+ VPCMP $0, %XMMZERO, %XMM1, %k2
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
+ kord %k1, %k2, %k1
+ /* Each bit in K1 represents a NULL or a mismatch. */
+ kord %k0, %k1, %k1
+ kmovd %k1, %ecx
+
+# ifdef USE_AS_WCSCMP
+ /* Only the last bit is valid. */
+ andl $0x1, %ecx
+# else
+ /* Only last 4 bits are valid. */
+ andl $0xf, %ecx
+# endif
+
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $4, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_4bytes):
+# endif
+ /* Less than 4 bytes to check, try one byte/dword at a time. */
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rdx), %eax
+ movl (%rsi, %rdx), %ecx
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+# endif
+ testl %eax, %eax
+ jne L(cross_page_loop)
+ subl %ecx, %eax
+ ret
+END (STRCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index ed319704c2..41d670b3e1 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
+ && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
new file mode 100644
index 0000000000..c2c581ecf7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCPY
+# define STRCPY __strcpy_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index e72a0cad7d..19f4d5fd4f 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -37,6 +37,10 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
/* zero register */
#define xmmZ xmm0
#define ymmZ ymm0
@@ -46,7 +50,7 @@
# ifndef USE_AS_STRCAT
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
mov %RDX_LP, %R8_LP
@@ -369,8 +373,8 @@ L(CopyVecSizeExit):
lea 1(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(CopyTwoVecSize1):
@@ -553,8 +557,7 @@ L(Exit1):
lea 2(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit2):
@@ -569,8 +572,7 @@ L(Exit2):
lea 3(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit3):
@@ -584,8 +586,7 @@ L(Exit3):
lea 4(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit4_7):
@@ -602,8 +603,7 @@ L(Exit4_7):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit8_15):
@@ -620,8 +620,7 @@ L(Exit8_15):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit16_31):
@@ -638,8 +637,7 @@ L(Exit16_31):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Exit32_63):
@@ -656,8 +654,7 @@ L(Exit32_63):
lea 1(%rdi, %rdx), %rdi
jnz L(StrncpyFillTailWithZero)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNCPY
@@ -671,8 +668,7 @@ L(StrncpyExit1):
# ifdef USE_AS_STRCAT
movb $0, 1(%rdi)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit2):
@@ -684,8 +680,7 @@ L(StrncpyExit2):
# ifdef USE_AS_STRCAT
movb $0, 2(%rdi)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit3_4):
@@ -699,8 +694,7 @@ L(StrncpyExit3_4):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit5_8):
@@ -714,8 +708,7 @@ L(StrncpyExit5_8):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit9_16):
@@ -729,8 +722,7 @@ L(StrncpyExit9_16):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit17_32):
@@ -744,8 +736,7 @@ L(StrncpyExit17_32):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit33_64):
@@ -760,8 +751,7 @@ L(StrncpyExit33_64):
# ifdef USE_AS_STRCAT
movb $0, (%rdi, %r8)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(StrncpyExit65):
@@ -778,50 +768,43 @@ L(StrncpyExit65):
# ifdef USE_AS_STRCAT
movb $0, 65(%rdi)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# ifndef USE_AS_STRCAT
.p2align 4
L(Fill1):
mov %dl, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Fill2):
mov %dx, (%rdi)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Fill3_4):
mov %dx, (%rdi)
mov %dx, -2(%rdi, %r8)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Fill5_8):
mov %edx, (%rdi)
mov %edx, -4(%rdi, %r8)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Fill9_16):
mov %rdx, (%rdi)
mov %rdx, -8(%rdi, %r8)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(Fill17_32):
vmovdqu %xmmZ, (%rdi)
vmovdqu %xmmZ, -16(%rdi, %r8)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(CopyVecSizeUnalignedVec2):
@@ -898,8 +881,7 @@ L(Fill):
cmp $1, %r8d
ja L(Fill2)
je L(Fill1)
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
/* end of ifndef USE_AS_STRCAT */
# endif
@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
# ifdef USE_AS_STRCAT
movb $0, (VEC_SIZE * 4)(%rdi)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
@@ -1001,16 +982,14 @@ L(StrncpyExit):
# ifdef USE_AS_STRCAT
movb $0, (%rdi)
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(ExitZero):
# ifndef USE_AS_STRCAT
mov %rdi, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
# endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
new file mode 100644
index 0000000000..a343a1a692
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -0,0 +1,1003 @@
+/* strcpy with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_evex
+# endif
+
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+# define VEC_SIZE 32
+# endif
+
+# define XMM2 xmm18
+# define XMM3 xmm19
+
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+# define YMM7 ymm23
+
+# ifndef USE_AS_STRCAT
+
+/* zero register */
+# define XMMZERO xmm16
+# define YMMZERO ymm16
+# define YMM1 ymm17
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %RDX_LP, %R8_LP /* r8 = length limit taken from rdx */
+ test %R8_LP, %R8_LP
+ jz L(ExitZero) /* limit of zero: nothing to copy */
+# endif
+ mov %rsi, %rcx /* rcx = copy of the source pointer; reduced to an alignment offset below */
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO /* zero vector used as the comparand when searching for NUL */
+# endif
+
+ and $((VEC_SIZE * 4) - 1), %ecx /* ecx = source offset within a 4 * VEC_SIZE block */
+ cmp $(VEC_SIZE * 2), %ecx
+ jbe L(SourceStringAlignmentLessTwoVecSize) /* taken when that offset is <= 2 * VEC_SIZE */
+
+ and $-VEC_SIZE, %rsi /* round the source pointer down to a VEC_SIZE boundary */
+ and $(VEC_SIZE - 1), %ecx /* ecx = offset of the string start inside that vector */
+
+ vpcmpb $0, (%rsi), %YMMZERO, %k0 /* predicate 0 is "equal": k0 = mask of zero bytes */
+ kmovd %k0, %edx
+ shr %cl, %rdx /* drop mask bits belonging to bytes before the string start */
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $VEC_SIZE, %r10
+ sub %rcx, %r10 /* r10 = VEC_SIZE - offset: bytes from the string start to the end of this vector */
+ cmp %r10, %r8
+# else
+ mov $(VEC_SIZE + 1), %r10
+ sub %rcx, %r10 /* same quantity plus one in this configuration */
+ cmp %r10, %r8
+# endif
+ jbe L(CopyVecSizeTailCase2OrCase3) /* unsigned: limit r8 <= r10 */
+# endif
+ test %edx, %edx
+ jnz L(CopyVecSizeTail) /* a zero byte was found in the first vector */
+
+ vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 /* same zero-byte test on the next aligned vector */
+ kmovd %k1, %edx
+
+# ifdef USE_AS_STRNCPY
+ add $VEC_SIZE, %r10 /* extend the threshold by one more vector */
+ cmp %r10, %r8
+ jbe L(CopyTwoVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyTwoVecSize) /* a zero byte was found in the second vector */
+
+ VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
+ VMOVU %YMM2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(UnalignVecSizeBoth):
+ sub %rcx, %rdi
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+ sbb %rcx, %rcx
+ or %rcx, %r8
+# endif
+ mov $VEC_SIZE, %rcx
+ VMOVA (%rsi, %rcx), %YMM2
+ VMOVU %YMM2, (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
+ vpcmpb $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 3), %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVU %YMM2, (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
+ vpcmpb $0, %YMM3, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVU %YMM3, (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
+ vpcmpb $0, %YMM4, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVU %YMM4, (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
+ vpcmpb $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVU %YMM2, (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
+ vpcmpb $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec2)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
+ VMOVU %YMM2, (%rdi, %rcx)
+ vpcmpb $0, %YMM3, %YMMZERO, %k0
+ kmovd %k0, %edx
+ add $VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+# endif
+ test %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec3)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ VMOVU %YMM3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ and $-(VEC_SIZE * 4), %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea (VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+ VMOVA (%rsi), %YMM4 /* aligned loads of four consecutive source vectors */
+ VMOVA VEC_SIZE(%rsi), %YMM5
+ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
+ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
+ vpminub %YMM5, %YMM4, %YMM2 /* unsigned byte minimum: a zero byte in either input gives zero */
+ vpminub %YMM7, %YMM6, %YMM3
+ vpminub %YMM2, %YMM3, %YMM2 /* YMM2 has a zero byte iff one of the four vectors does */
+ /* If K7 != 0, there is a null byte. */
+ vpcmpb $0, %YMM2, %YMMZERO, %k7
+ kmovd %k7, %edx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3) /* unsigned: r8 was <= 4 * VEC_SIZE before the subtraction */
+# endif
+ test %edx, %edx
+ jnz L(UnalignedFourVecSizeLeave) /* zero byte somewhere in YMM4..YMM7; nothing stored yet */
+
+L(UnalignedFourVecSizeLoop_start):
+ add $(VEC_SIZE * 4), %rdi
+ add $(VEC_SIZE * 4), %rsi
+ VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) /* store the previous four vectors, interleaved with loading the next four */
+ VMOVA (%rsi), %YMM4
+ VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
+ VMOVA VEC_SIZE(%rsi), %YMM5
+ vpminub %YMM5, %YMM4, %YMM2
+ VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
+ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
+ VMOVU %YMM7, -VEC_SIZE(%rdi)
+ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
+ vpminub %YMM7, %YMM6, %YMM3
+ vpminub %YMM2, %YMM3, %YMM2 /* combined minimum of the four new vectors */
+ /* If K7 != 0, there is a null byte. */
+ vpcmpb $0, %YMM2, %YMMZERO, %k7
+ kmovd %k7, %edx
+# ifdef USE_AS_STRNCPY
+ sub $(VEC_SIZE * 4), %r8
+ jbe L(UnalignedLeaveCase2OrCase3) /* unsigned: r8 was <= 4 * VEC_SIZE before the subtraction */
+# endif
+ test %edx, %edx
+ jz L(UnalignedFourVecSizeLoop_start) /* no zero byte: keep looping; otherwise fall through */
+
+L(UnalignedFourVecSizeLeave):
+ vpcmpb $0, %YMM4, %YMMZERO, %k1
+ kmovd %k1, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_0)
+
+ vpcmpb $0, %YMM5, %YMMZERO, %k2
+ kmovd %k2, %ecx
+ test %ecx, %ecx
+ jnz L(CopyVecSizeUnaligned_16)
+
+ vpcmpb $0, %YMM6, %YMMZERO, %k3
+ kmovd %k3, %edx
+ test %edx, %edx
+ jnz L(CopyVecSizeUnaligned_32)
+
+ vpcmpb $0, %YMM7, %YMMZERO, %k4
+ kmovd %k4, %ecx
+ bsf %ecx, %edx
+ VMOVU %YMM4, (%rdi)
+ VMOVU %YMM5, VEC_SIZE(%rdi)
+ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
+# endif
+ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 3), %rsi
+ add $(VEC_SIZE * 3), %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+ VMOVU (%rsi), %YMM3
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ vpcmpb $0, %YMM3, %YMMZERO, %k0
+ kmovd %k0, %edx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $VEC_SIZE, %r8
+# else
+ cmp $(VEC_SIZE + 1), %r8
+# endif
+ jbe L(CopyVecSizeTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyVecSizeTail1)
+
+ VMOVU %YMM3, (%rdi)
+ vpcmpb $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %edx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $(VEC_SIZE * 2), %r8
+# else
+ cmp $((VEC_SIZE * 2) + 1), %r8
+# endif
+ jbe L(CopyTwoVecSize1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyTwoVecSize1)
+
+ and $-VEC_SIZE, %rsi
+ and $(VEC_SIZE - 1), %ecx
+ jmp L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyVecSize):
+ add %rcx, %rdi /* advance destination by rcx, then fall through */
+# endif
+L(CopyVecSizeTail):
+ add %rcx, %rsi /* advance source by rcx, then fall through */
+L(CopyVecSizeTail1):
+ bsf %edx, %edx /* edx = index of the lowest set bit of the zero-byte mask */
+L(CopyVecSizeExit):
+ cmp $32, %edx /* dispatch on edx (index of the NUL) to a copy routine for that size range */
+ jae L(Exit32_63)
+ cmp $16, %edx
+ jae L(Exit16_31)
+ cmp $8, %edx
+ jae L(Exit8_15)
+ cmp $4, %edx
+ jae L(Exit4_7)
+ cmp $3, %edx
+ je L(Exit3)
+ cmp $1, %edx /* one compare serves both branches: above -> 2, equal -> 1 */
+ ja L(Exit2)
+ je L(Exit1)
+ movb $0, (%rdi) /* edx == 0: store only the terminator */
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax /* rax = address of the stored NUL */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8 /* one byte of the count consumed */
+ lea 1(%rdi), %rdi /* lea does not modify flags, so jnz below tests the sub */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(CopyTwoVecSize1):
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $VEC_SIZE, %r8
+# endif
+ jmp L(CopyVecSizeTail1)
+
+ .p2align 4
+L(CopyTwoVecSize):
+ bsf %edx, %edx
+ add %rcx, %rsi
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ jmp L(CopyVecSizeExit)
+
+ .p2align 4
+L(CopyVecSizeUnaligned_0):
+ bsf %edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ VMOVU %YMM4, (%rdi)
+ add $((VEC_SIZE * 4) - 1), %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ jmp L(CopyVecSizeExit)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_16):
+ bsf %ecx, %edx
+ VMOVU %YMM4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea VEC_SIZE(%rdi, %rdx), %rax
+# endif
+ VMOVU %YMM5, VEC_SIZE(%rdi)
+ add $((VEC_SIZE * 3) - 1), %r8
+ sub %rdx, %r8
+ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $VEC_SIZE, %rsi
+ add $VEC_SIZE, %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+ .p2align 4
+L(CopyVecSizeUnaligned_32):
+ bsf %edx, %edx
+ VMOVU %YMM4, (%rdi)
+ VMOVU %YMM5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
+# endif
+ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ add $((VEC_SIZE * 2) - 1), %r8
+ sub %rdx, %r8
+ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $(VEC_SIZE * 2), %rsi
+ add $(VEC_SIZE * 2), %rdi
+ jmp L(CopyVecSizeExit)
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyVecSizeUnalignedVec6):
+ VMOVU %YMM6, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec5):
+ VMOVU %YMM5, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec4):
+ VMOVU %YMM4, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec3):
+ VMOVU %YMM3, (%rdi, %rcx)
+ jmp L(CopyVecSizeVecExit)
+# endif
+
+/* Case2 */
+
+ .p2align 4
+L(CopyVecSizeCase2):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2):
+ add %rcx, %rsi
+ bsf %edx, %edx
+ add $VEC_SIZE, %edx
+ sub %ecx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+L(CopyVecSizeTailCase2):
+ add %rcx, %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+L(CopyVecSizeTail1Case2):
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+ jmp L(StrncpyExit)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+ add $VEC_SIZE, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyTwoVecSizeCase2)
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTailCase2)
+ add %rcx, %rsi
+ jmp L(StrncpyExit)
+
+ .p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+ add $VEC_SIZE, %rdi
+ add $VEC_SIZE, %rsi
+ sub $VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyVecSizeTail1Case2)
+ jmp L(StrncpyExit)
+# endif
+
+/*------------End of labels for copying 1-VEC_SIZE bytes and 1-(VEC_SIZE*2) bytes----*/
+
+ .p2align 4
+L(Exit1): /* NUL index is 1: one character followed by the terminator */
+ movzwl (%rsi), %edx /* load both bytes with a single 2-byte read */
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8 /* two bytes of the count consumed */
+ lea 2(%rdi), %rdi /* lea does not modify flags, so jnz below tests the sub */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit2): /* NUL index is 2: two characters followed by the terminator */
+ movzwl (%rsi), %ecx /* the two characters */
+ mov %cx, (%rdi)
+ movb $0, 2(%rdi) /* terminator stored separately */
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8 /* three bytes of the count consumed */
+ lea 3(%rdi), %rdi /* lea does not modify flags, so jnz below tests the sub */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit3): /* NUL index is 3: three characters plus terminator fit one 4-byte move */
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8 /* four bytes of the count consumed */
+ lea 4(%rdi), %rdi /* lea does not modify flags, so jnz below tests the sub */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit4_7): /* rdx = NUL index, 4..7 per the dispatch in CopyVecSizeExit */
+ mov (%rsi), %ecx /* bytes 0..3 */
+ mov %ecx, (%rdi)
+ mov -3(%rsi, %rdx), %ecx /* bytes rdx-3..rdx, ending on the NUL; may overlap the first move */
+ mov %ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8 /* together: r8 -= rdx + 1 bytes consumed */
+ lea 1(%rdi, %rdx), %rdi /* first byte after the NUL; flags from the sub are preserved */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit8_15): /* rdx = NUL index, 8..15 per the dispatch in CopyVecSizeExit */
+ mov (%rsi), %rcx /* bytes 0..7 */
+ mov -7(%rsi, %rdx), %r9 /* bytes rdx-7..rdx, ending on the NUL; may overlap the first load */
+ mov %rcx, (%rdi)
+ mov %r9, -7(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8 /* together: r8 -= rdx + 1 bytes consumed */
+ lea 1(%rdi, %rdx), %rdi /* first byte after the NUL; flags from the sub are preserved */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit16_31): /* rdx = NUL index, 16..31 per the dispatch in CopyVecSizeExit */
+ VMOVU (%rsi), %XMM2 /* bytes 0..15 */
+ VMOVU -15(%rsi, %rdx), %XMM3 /* bytes rdx-15..rdx, ending on the NUL; may overlap the first load */
+ VMOVU %XMM2, (%rdi)
+ VMOVU %XMM3, -15(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8 /* together: r8 -= rdx + 1 bytes consumed */
+ lea 1(%rdi, %rdx), %rdi /* first byte after the NUL; flags from the sub are preserved */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit32_63): /* rdx = NUL index, at least 32 per the dispatch in CopyVecSizeExit */
+ VMOVU (%rsi), %YMM2 /* bytes 0..31 */
+ VMOVU -31(%rsi, %rdx), %YMM3 /* bytes rdx-31..rdx, ending on the NUL; may overlap the first load */
+ VMOVU %YMM2, (%rdi)
+ VMOVU %YMM3, -31(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* rax = address of the NUL in the destination */
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub %rdx, %r8
+ sub $1, %r8 /* together: r8 -= rdx + 1 bytes consumed */
+ lea 1(%rdi, %rdx), %rdi /* first byte after the NUL; flags from the sub are preserved */
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit1): /* reached from L(StrncpyExit) when r8d == 1: copy exactly one byte */
+ movzbl (%rsi), %edx
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax /* rax = one past the copied byte */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 1(%rdi) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit2): /* reached from L(StrncpyExit) when r8d == 2: copy exactly two bytes */
+ movzwl (%rsi), %edx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 2(%rdi) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit3_4): /* r8 = byte count, 3 or 4 per the dispatch in L(StrncpyExit) */
+ movzwl (%rsi), %ecx /* first two bytes */
+ movzwl -2(%rsi, %r8), %edx /* last two bytes; overlaps the first pair when r8 == 3 */
+ mov %cx, (%rdi)
+ mov %dx, -2(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit5_8): /* r8 = byte count, 5..8 per the dispatch in L(StrncpyExit) */
+ mov (%rsi), %ecx /* first four bytes */
+ mov -4(%rsi, %r8), %edx /* last four bytes; may overlap the first four */
+ mov %ecx, (%rdi)
+ mov %edx, -4(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit9_16): /* r8 = byte count, 9..16 per the dispatch in L(StrncpyExit) */
+ mov (%rsi), %rcx /* first eight bytes */
+ mov -8(%rsi, %r8), %rdx /* last eight bytes; may overlap the first eight */
+ mov %rcx, (%rdi)
+ mov %rdx, -8(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit17_32): /* r8 = byte count, 17..32 per the dispatch in L(StrncpyExit) */
+ VMOVU (%rsi), %XMM2 /* first sixteen bytes */
+ VMOVU -16(%rsi, %r8), %XMM3 /* last sixteen bytes; may overlap the first sixteen */
+ VMOVU %XMM2, (%rdi)
+ VMOVU %XMM3, -16(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit33_64): /* r8 = byte count, at least 33 per the dispatch in L(StrncpyExit) */
+ /* Two VEC_SIZE moves: one from the start, one ending at byte r8. */
+ VMOVU (%rsi), %YMM2
+ VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 /* may overlap the first move */
+ VMOVU %YMM2, (%rdi)
+ VMOVU %YMM3, -VEC_SIZE(%rdi, %r8)
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %r8), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi, %r8) /* strcat variants terminate the result here */
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit65): /* reached from L(StrncpyExit) when r8d == 65 */
+ /* Offset/size of the three moves: 0/32, 32/32, 64/1. */
+ VMOVU (%rsi), %YMM2
+ VMOVU 32(%rsi), %YMM3
+ mov 64(%rsi), %cl /* the single 65th byte */
+ VMOVU %YMM2, (%rdi)
+ VMOVU %YMM3, 32(%rdi)
+ mov %cl, 64(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 65(%rdi), %rax /* rax = one past the copied bytes */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, 65(%rdi) /* strcat variants terminate the result here */
+# endif
+ ret
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(Fill1): /* Fill* routines: rdx is zero (cleared in StrncpyFillTailWithZero), r8 = bytes to zero */
+ mov %dl, (%rdi) /* one zero byte */
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi) /* two zero bytes */
+ ret
+
+ .p2align 4
+L(Fill3_4):
+ mov %dx, (%rdi) /* first two bytes */
+ mov %dx, -2(%rdi, %r8) /* last two bytes, ending at rdi + r8; may overlap */
+ ret
+
+ .p2align 4
+L(Fill5_8):
+ mov %edx, (%rdi) /* first four bytes */
+ mov %edx, -4(%rdi, %r8) /* last four bytes, ending at rdi + r8; may overlap */
+ ret
+
+ .p2align 4
+L(Fill9_16):
+ mov %rdx, (%rdi) /* first eight bytes */
+ mov %rdx, -8(%rdi, %r8) /* last eight bytes, ending at rdi + r8; may overlap */
+ ret
+
+ .p2align 4
+L(Fill17_32):
+ VMOVU %XMMZERO, (%rdi) /* first sixteen bytes */
+ VMOVU %XMMZERO, -16(%rdi, %r8) /* last sixteen bytes, ending at rdi + r8; may overlap */
+ ret
+
+ .p2align 4
+L(CopyVecSizeUnalignedVec2):
+ VMOVU %YMM2, (%rdi, %rcx) /* store the whole vector held in YMM2, then fall through */
+
+ .p2align 4
+L(CopyVecSizeVecExit):
+ bsf %edx, %edx /* edx = index of the lowest set bit of the zero-byte mask */
+ add $(VEC_SIZE - 1), %r8
+ add %rcx, %rdi /* rdi = start of the vector just stored */
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* rax = address of the NUL in the destination */
+# endif
+ sub %rdx, %r8 /* with the add above: r8 += VEC_SIZE - 1 - rdx */
+ lea 1(%rdi, %rdx), %rdi /* first byte after the NUL; execution continues into the zero fill */
+
+ .p2align 4
+L(StrncpyFillTailWithZero): /* zero r8 bytes starting at rdi */
+ xor %edx, %edx /* rdx = 0, the value written by the Fill* stores */
+ sub $VEC_SIZE, %r8
+ jbe L(StrncpyFillExit) /* unsigned: at most VEC_SIZE bytes to fill */
+
+ VMOVU %YMMZERO, (%rdi) /* one unaligned vector of zeros */
+ add $VEC_SIZE, %rdi
+
+ mov %rdi, %rsi
+ and $(VEC_SIZE - 1), %esi /* esi = misalignment of rdi */
+ sub %rsi, %rdi /* step rdi back to a VEC_SIZE boundary */
+ add %rsi, %r8 /* and grow the count by the bytes stepped back over */
+ sub $(VEC_SIZE * 4), %r8
+ jb L(StrncpyFillLessFourVecSize) /* fewer than four vectors left */
+
+L(StrncpyFillLoopVmovdqa):
+ VMOVA %YMMZERO, (%rdi) /* aligned stores: 4 * VEC_SIZE zero bytes per iteration */
+ VMOVA %YMMZERO, VEC_SIZE(%rdi)
+ VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi)
+ add $(VEC_SIZE * 4), %rdi
+ sub $(VEC_SIZE * 4), %r8
+ jae L(StrncpyFillLoopVmovdqa) /* loop while the subtraction did not borrow */
+
+L(StrncpyFillLessFourVecSize):
+ add $(VEC_SIZE * 2), %r8 /* r8 is now (remaining - 2 * VEC_SIZE) */
+ jl L(StrncpyFillLessTwoVecSize) /* signed test: negative means fewer than two vectors remain */
+ VMOVA %YMMZERO, (%rdi)
+ VMOVA %YMMZERO, VEC_SIZE(%rdi)
+ add $(VEC_SIZE * 2), %rdi
+ sub $VEC_SIZE, %r8
+ jl L(StrncpyFillExit) /* less than one more full vector */
+ VMOVA %YMMZERO, (%rdi)
+ add $VEC_SIZE, %rdi
+ jmp L(Fill)
+
+ .p2align 4
+L(StrncpyFillLessTwoVecSize):
+ add $VEC_SIZE, %r8 /* r8 is now (remaining - VEC_SIZE) */
+ jl L(StrncpyFillExit) /* less than one full vector */
+ VMOVA %YMMZERO, (%rdi)
+ add $VEC_SIZE, %rdi
+ jmp L(Fill)
+
+ .p2align 4
+L(StrncpyFillExit):
+ add $VEC_SIZE, %r8 /* every jump here arrives with r8 biased by -VEC_SIZE; undo it */
+L(Fill):
+ cmp $17, %r8d /* dispatch on the remaining byte count in r8d */
+ jae L(Fill17_32)
+ cmp $9, %r8d
+ jae L(Fill9_16)
+ cmp $5, %r8d
+ jae L(Fill5_8)
+ cmp $3, %r8d
+ jae L(Fill3_4)
+ cmp $1, %r8d /* one compare serves both branches: above -> 2, equal -> 1 */
+ ja L(Fill2)
+ je L(Fill1)
+ ret
+
+/* end of ifndef USE_AS_STRCAT */
+# endif
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+ lea (VEC_SIZE * 4)(%r8), %rcx
+ and $-VEC_SIZE, %rcx
+ add $(VEC_SIZE * 3), %r8
+ jl L(CopyVecSizeCase3)
+ VMOVU %YMM4, (%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ VMOVU %YMM5, VEC_SIZE(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ sub $VEC_SIZE, %r8
+ jb L(CopyVecSizeCase3)
+ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
+# ifdef USE_AS_STPCPY
+ lea (VEC_SIZE * 4)(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (VEC_SIZE * 4)(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+ xor %ecx, %ecx
+ vpcmpb $0, %YMM4, %YMMZERO, %k1
+ kmovd %k1, %edx
+ add $(VEC_SIZE * 3), %r8
+ jle L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec4)
+# else
+ jnz L(CopyVecSize)
+# endif
+ vpcmpb $0, %YMM5, %YMMZERO, %k2
+ kmovd %k2, %edx
+ VMOVU %YMM4, (%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec5)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpb $0, %YMM6, %YMMZERO, %k3
+ kmovd %k3, %edx
+ VMOVU %YMM5, VEC_SIZE(%rdi)
+ add $VEC_SIZE, %rcx
+ sub $VEC_SIZE, %r8
+ jbe L(CopyVecSizeCase2OrCase3)
+ test %edx, %edx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyVecSizeUnalignedVec6)
+# else
+ jnz L(CopyVecSize)
+# endif
+
+ vpcmpb $0, %YMM7, %YMMZERO, %k4
+ kmovd %k4, %edx
+ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ lea VEC_SIZE(%rdi, %rcx), %rdi
+ lea VEC_SIZE(%rsi, %rcx), %rsi
+ bsf %edx, %edx
+ cmp %r8d, %edx
+ jb L(CopyVecSizeExit)
+L(StrncpyExit): /* dispatch on r8d, the number of bytes the StrncpyExit* routines copy */
+ cmp $65, %r8d
+ je L(StrncpyExit65)
+ cmp $33, %r8d
+ jae L(StrncpyExit33_64)
+ cmp $17, %r8d
+ jae L(StrncpyExit17_32)
+ cmp $9, %r8d
+ jae L(StrncpyExit9_16)
+ cmp $5, %r8d
+ jae L(StrncpyExit5_8)
+ cmp $3, %r8d
+ jae L(StrncpyExit3_4)
+ cmp $1, %r8d /* one compare serves both branches: above -> 2, equal -> 1 */
+ ja L(StrncpyExit2)
+ je L(StrncpyExit1)
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax /* r8d == 0: nothing copied, return the current destination */
+# endif
+# ifdef USE_AS_STRCAT
+ movb $0, (%rdi) /* strcat variants still write the terminator */
+# endif
+ ret
+
+ .p2align 4
+L(ExitZero): /* entry-time exit taken when the length limit is zero */
+# ifndef USE_AS_STRCAT
+ mov %rdi, %rax /* return the destination pointer unchanged */
+# endif
+ ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
new file mode 100644
index 0000000000..75b4b7612c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRLEN
+# define STRLEN __strlen_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 3e7f14a846..f5d058a8d5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,370 +27,475 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
+ mov %RSI_LP, %R8_LP
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
- mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
+ movl %edi, %eax
movq %rdi, %rdx
vpxor %xmm0, %xmm0, %xmm0
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* If empty continue to aligned_more. Otherwise return bit
+ position of first match. */
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+L(first_vec_x1):
tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 4 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ incl %edi
+ addl %edi, %eax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ shrl $2, %eax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 3 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
# endif
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 2 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 2 + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 3 + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
+ code on the x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+ it simplifies the logic in last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ subq %rdx, %rcx
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* Readjust length. */
addq %rcx, %rsi
+# else
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
-
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
-
- VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there were no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ vmovdqa 1(%rdi), %ymm1
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+ VPMINU %ymm2, %ymm4, %ymm5
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %ecx
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq %rdx, %rdi
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(last_vec_return_x0)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
+ jnz L(last_vec_return_x1)
+
+ /* Combine last 2 VEC. */
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2 - 1), %rdi
+ addq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER_RETURN
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ subq $-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
- jnz L(first_vec_x3_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(VEC_SIZE * 2), %esi
+ jnz L(last_4x_vec)
- .p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ /* length may have been negative or positive by an offset of
+ VEC_SIZE * 4 depending on where this was called from. This fixes
+ that. */
+ andl $(VEC_SIZE * 4 - 1), %esi
testl %eax, %eax
+ jnz L(last_vec_x1_check)
- jnz L(first_vec_x0_check)
subl $VEC_SIZE, %esi
- jle L(max)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+ jb L(max)
- .p2align 4
-L(first_vec_x0_check):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
+ subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+# endif
+ VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
+ subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+# endif
+ VZEROUPPER_RETURN
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
- .p2align 4
L(max):
movq %r8, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ /* Normalize length. */
+ andl $(VEC_SIZE * 4 - 1), %esi
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ subl $(VEC_SIZE * 3), %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 3 + 1), %eax
+ addq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+# endif
+ VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
- VZEROUPPER
- ret
+# endif
+ VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ subl $(VEC_SIZE * 2), %esi
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
+# endif
+ VZEROUPPER_RETURN
+L(max_end):
+ movq %r8, %rax
+ VZEROUPPER_RETURN
# endif
- VZEROUPPER
- ret
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
- VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
+ so no need to manually mod rdx. */
+ sarxl %edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
+ jnz L(cross_page_less_vec)
+ leaq 1(%rdi), %rcx
+ subq %rdx, %rcx
+ /* Check length. */
+ cmpq %rsi, %rcx
+ jb L(cross_page_continue)
+ movq %r8, %rax
+# else
testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMPEQ %ymm4, %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(cross_page_less_vec):
+ tzcntl %eax, %eax
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- VZEROUPPER
- ret
END (STRLEN)
#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
new file mode 100644
index 0000000000..0583819078
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -0,0 +1,436 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_evex
+# endif
+
+# define VMOVA vmovdqa64
+
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define SHIFT_REG r9d
+# else
+# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define SHIFT_REG ecx
+# endif
+
+# define XMMZERO xmm16
+# define YMMZERO ymm16
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+ /* Check for zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+# ifdef USE_AS_WCSLEN
+ shl $2, %RSI_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+ mov %RSI_LP, %R8_LP
+# endif
+ movl %edi, %ecx
+ movq %rdi, %rdx
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
+ null byte. */
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+
+# ifdef USE_AS_STRNLEN
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rsi
+ jbe L(max)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide shift count by 4 since each bit in K0 represents 4
+ bytes. */
+ movl %ecx, %SHIFT_REG
+ sarl $2, %SHIFT_REG
+# endif
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+
+ /* Remove the leading bytes. */
+ sarxl %SHIFT_REG, %eax, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+# ifdef USE_AS_STRNLEN
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+ /* "rcx" is less than VEC_SIZE. Calculate "rsi + rcx - VEC_SIZE"
+ with "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
+ to avoid possible addition overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rsi
+ jbe L(max)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVA (%rdi), %YMM1
+ VMOVA VEC_SIZE(%rdi), %YMM2
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
+
+ VPMINU %YMM1, %YMM2, %YMM5
+ VPMINU %YMM3, %YMM4, %YMM6
+
+ VPMINU %YMM5, %YMM6, %YMM5
+ VPCMP $0, %YMM5, %YMMZERO, %k0
+ ktestd %k0, %k0
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rsi
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %esi
+ jle L(last_2x_vec)
+
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %esi
+
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(max):
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ VPCMP $0, %YMM1, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ VPCMP $0, %YMM2, %YMMZERO, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ VPCMP $0, %YMM3, %YMMZERO, %k2
+ kmovd %k2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ VPCMP $0, %YMM4, %YMMZERO, %k3
+ kmovd %k3, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ sall $2, %eax
+# endif
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 6f5bfc7bde..8d748aa321 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
# define strlen __strlen_sse2
#endif
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 0000000000..8f660bb9c7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long-lived registers in strlen(s), strnlen(s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 - (s+n) & (~(64-1))
+ %r11 - s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+ shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+#endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+#endif
+
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+#endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
new file mode 100644
index 0000000000..0dcea18dbb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2_rtm
+#include "strcat-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
new file mode 100644
index 0000000000..8884f02371
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_evex
+#include "strcat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
new file mode 100644
index 0000000000..37d1224bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCMP __strncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
new file mode 100644
index 0000000000..a1d53e8c9f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
@@ -0,0 +1,3 @@
+#define STRCMP __strncmp_evex
+#define USE_AS_STRNCMP 1
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index e7b29dea4a..52f208ce0d 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -30,16 +30,29 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
+ && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
new file mode 100644
index 0000000000..79e7083299
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
new file mode 100644
index 0000000000..40e391f0da
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
new file mode 100644
index 0000000000..04f1626a5c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2_rtm
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
new file mode 100644
index 0000000000..722022f303
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
new file mode 100644
index 0000000000..5def14ec1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRRCHR
+# define STRRCHR __strrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c66df12bff..9116472273 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -36,9 +36,13 @@
# define VZEROUPPER vzeroupper
# endif
+# ifndef SECTION
+# define SECTION(p) p##.avx
+# endif
+
# define VEC_SIZE 32
- .section .text.avx,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (STRRCHR)
movd %esi, %xmm4
movl %edi, %ecx
@@ -166,8 +170,8 @@ L(return_value):
# endif
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
L(match):
@@ -198,8 +202,7 @@ L(find_nul):
jz L(return_value)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(char_and_nul):
@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
jz L(return_null)
bsrl %eax, %eax
leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
.p2align 4
L(return_null):
xorl %eax, %eax
- VZEROUPPER
- ret
+ VZEROUPPER_RETURN
END (STRRCHR)
#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
new file mode 100644
index 0000000000..f920b5a584
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -0,0 +1,265 @@
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+# define STRRCHR __strrchr_evex
+# endif
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
+# ifdef USE_AS_WCSRCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMP vpcmpd
+# define SHIFT_REG r8d
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMP vpcmpb
+# define SHIFT_REG ecx
+# endif
+
+# define XMMZERO xmm16
+# define YMMZERO ymm16
+# define YMMMATCH ymm17
+# define YMM1 ymm18
+
+# define VEC_SIZE 32
+
+ .section .text.evex,"ax",@progbits
+ENTRY (STRRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMMMATCH. */
+ VPBROADCAST %esi, %YMMMATCH
+
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ VMOVU (%rdi), %YMM1
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+
+ addq $VEC_SIZE, %rdi
+
+ testl %eax, %eax
+ jnz L(first_vec)
+
+ testl %ecx, %ecx
+ jnz L(return_null)
+
+ andq $-VEC_SIZE, %rdi
+ xorl %edx, %edx
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(first_vec):
+ /* Check if there is a null byte. */
+ testl %ecx, %ecx
+ jnz L(char_and_nul_in_first_vec)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_WCSRCHR
+ /* NB: Divide shift count by 4 since each bit in K1 represents 4
+ bytes. */
+ movl %ecx, %SHIFT_REG
+ sarl $2, %SHIFT_REG
+# endif
+
+ VMOVA (%rdi), %YMM1
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %edx
+ kmovd %k1, %eax
+
+ shrxl %SHIFT_REG, %edx, %edx
+ shrxl %SHIFT_REG, %eax, %eax
+ addq $VEC_SIZE, %rdi
+
+ /* Check if there is a CHAR. */
+ testl %eax, %eax
+ jnz L(found_char)
+
+ testl %edx, %edx
+ jnz L(return_null)
+
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(found_char):
+ testl %edx, %edx
+ jnz L(char_and_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ leaq (%rdi, %rcx), %rsi
+
+ .p2align 4
+L(aligned_loop):
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ add $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jnz L(char_nor_null)
+
+ VMOVA (%rdi), %YMM1
+ addq $VEC_SIZE, %rdi
+
+ /* Each bit in K0 represents a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM1, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k0, %ecx
+ kmovd %k1, %eax
+ orl %eax, %ecx
+ jz L(aligned_loop)
+
+ .p2align 4
+L(char_nor_null):
+ /* Find a CHAR or a null byte in a loop. */
+ testl %eax, %eax
+ jnz L(match)
+L(return_value):
+ testl %edx, %edx
+ jz L(return_null)
+ movl %edx, %eax
+ movq %rsi, %rdi
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(match):
+ /* Find a CHAR. Check if there is a null byte. */
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(find_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(find_nul):
+ /* Mask out any matching bits after the null byte. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* If there is no CHAR here, return the remembered one. */
+ jz L(return_value)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(char_and_nul):
+ /* Find both a CHAR and a null byte. */
+ addq %rcx, %rdi
+ movl %edx, %ecx
+L(char_and_nul_in_first_vec):
+ /* Mask out any matching bits after the null byte. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* Return null pointer if the null byte comes first. */
+ jz L(return_null)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+# else
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+# endif
+ ret
+
+ .p2align 4
+L(return_null):
+ xorl %eax, %eax
+ ret
+
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
new file mode 100644
index 0000000000..d49dbbf0b4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2_rtm
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
new file mode 100644
index 0000000000..7cb8f1e41a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_evex
+#define USE_AS_WCSCHR 1
+#include "strchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
new file mode 100644
index 0000000000..d6ca2b8064
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2_rtm
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
new file mode 100644
index 0000000000..42e73e51eb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_evex
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
new file mode 100644
index 0000000000..35658d7365
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2_rtm
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
new file mode 100644
index 0000000000..bdafa83bd5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 0000000000..7e62621afc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen __wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index c23ce457d2..13070fdcf4 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
# undef __wcslen
# define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
new file mode 100644
index 0000000000..4e88c70cc6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
new file mode 100644
index 0000000000..8a8e310713
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_evex
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
new file mode 100644
index 0000000000..7437ebee2d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2_rtm
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
new file mode 100644
index 0000000000..24773bb4e2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb00..5fa51fe07c 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 3da11970c8..f1b6bc87b4 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,27 +24,7 @@
# undef __wcsnlen
# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2);
-
- if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
weak_alias (__wcsnlen, wcsnlen);
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
new file mode 100644
index 0000000000..9bf760833f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2_rtm
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
new file mode 100644
index 0000000000..c64602f7dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_evex
+#define USE_AS_WCSRCHR 1
+#include "strrchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
new file mode 100644
index 0000000000..58ed21db01
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2_rtm
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
new file mode 100644
index 0000000000..06cd0f9f5a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_evex
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
new file mode 100644
index 0000000000..31104d1215
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe_rtm
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 0000000000..4726d74aa1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 9ab357fc1a..ad047d8402 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
- Copyright (C) 2012-2019 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+ Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
- shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
- ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- SHIFT_RETURN; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- /* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm4
- PCMPEQ %xmm0, %xmm4
- pmovmskb %xmm4, %edx
- test %edx, %edx
- je L(next48_bytes)
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
- SHIFT_RETURN
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- PCMPEQ 16(%rax), %xmm1
- PCMPEQ 32(%rax), %xmm2
- PCMPEQ 48(%rax), %xmm3
- pmovmskb %xmm1, %edx
- pmovmskb %xmm2, %r8d
- pmovmskb %xmm3, %ecx
- salq $16, %rdx
- salq $16, %rcx
- orq %r8, %rcx
- salq $32, %rcx
- orq %rcx, %rdx
-#endif
-
- /* When no zero byte is found xmm1-3 are zero so we do not have to
- zero them. */
- PROLOG(loop)
-
- .p2align 4
-L(cross_page):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- SHIFT_RETURN
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
- .p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
-
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm0, %xmm0
- FIND_ZERO
-
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
- .p2align 4
-L(exit):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#else
-
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
- .p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm0
- PMINU 80(%rax), %xmm0
- PMINU 96(%rax), %xmm0
- PMINU 112(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
- .p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#endif
-
-END(strlen)
libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 7b64be935b..7f5cd1b7ed 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -95,6 +95,28 @@ lose: \
#define R14_LP r14
#define R15_LP r15
+/* Zero upper vector registers and return, using XTEST to pick the instruction.
+ NB: Inside an RTM transaction use VZEROALL, because VZEROUPPER there would trigger an abort; outside one (the 1: path), use VZEROUPPER. */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+ xtest; \
+ jz 1f; \
+ vzeroall; \
+ ret; \
+1: \
+ vzeroupper; \
+ ret
+
+/* Default: zero upper vector registers and return. The #ifndef keeps any definition made earlier, so a file can substitute its own sequence. VZEROUPPER (upper case) is presumably a macro supplied by the including file - TODO confirm. */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ VZEROUPPER; \
+ ret
+#endif
+
+#ifndef VZEROUPPER_RETURN /* Default only; a definition made earlier takes precedence. */
+# define VZEROUPPER_RETURN VZEROUPPER; ret
+#endif
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */