aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS3
-rw-r--r--elf/Makefile2
-rw-r--r--elf/ifuncmain6pie.c14
-rw-r--r--elf/ifuncmod6.c8
-rw-r--r--iconvdata/Makefile4
-rw-r--r--iconvdata/bug-iconv14.c127
-rw-r--r--iconvdata/iso-2022-jp-3.c67
-rw-r--r--sysdeps/aarch64/dl-machine.h12
-rw-r--r--sysdeps/aarch64/memcpy.S211
-rw-r--r--sysdeps/aarch64/multiarch/Makefile2
-rw-r--r--sysdeps/aarch64/multiarch/ifunc-impl-list.c2
-rw-r--r--sysdeps/aarch64/multiarch/memcpy.c8
-rw-r--r--sysdeps/aarch64/multiarch/memcpy_advsimd.S248
-rw-r--r--sysdeps/aarch64/multiarch/memmove.c6
-rw-r--r--sysdeps/aarch64/sysdep.h2
-rw-r--r--sysdeps/i386/dl-machine.h16
-rw-r--r--sysdeps/unix/sysv/linux/aarch64/cpu-features.h8
-rw-r--r--sysdeps/x86/Makefile6
-rw-r--r--sysdeps/x86/dl-cet.c6
-rw-r--r--sysdeps/x86/tst-setjmp-cet.c1
-rw-r--r--sysdeps/x86_64/dl-machine.h16
-rw-r--r--sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S21
22 files changed, 632 insertions, 158 deletions
diff --git a/NEWS b/NEWS
index 1884f741e6..1668896930 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,7 @@ The following bugs are resolved with this release:
[18035] Fix pldd hang
[19444] build failures with -O1 due to -Wmaybe-uninitialized
[20018] getaddrinfo should reject IP addresses with trailing characters
+ [20019] NULL pointer dereference in libc.so.6 IFUNC due to uninitialized GOT
[20209] localedata: Spelling mistake for Sunday in Greenlandic kl_GL
[20568] Fix crash in _IO_wfile_sync
[22927] libanl: properly cleanup if first helper thread creation failed
@@ -76,6 +77,8 @@ The following bugs are resolved with this release:
[25414] 'glob' use-after-free bug (CVE-2020-1752)
[25423] Array overflow in backtrace on powerpc
[25933] Off by one error in __strncmp_avx2
+ [27130] "rep movsb" performance issue
+ [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work
Security related changes:
diff --git a/elf/Makefile b/elf/Makefile
index 6027926bd1..564b814b48 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -1267,6 +1267,8 @@ CFLAGS-ifuncmain5pie.c += $(pie-ccflag)
CFLAGS-ifuncmain6pie.c += $(pie-ccflag)
CFLAGS-ifuncmain7pie.c += $(pie-ccflag)
+LDFLAGS-ifuncmain6pie = -Wl,-z,lazy
+
$(objpfx)ifuncmain1pie: $(objpfx)ifuncmod1.so
$(objpfx)ifuncmain1staticpie: $(objpfx)ifuncdep1pic.o
$(objpfx)ifuncmain1vispie: $(objpfx)ifuncmod1.so
diff --git a/elf/ifuncmain6pie.c b/elf/ifuncmain6pie.c
index 04faeb86ef..4a01906836 100644
--- a/elf/ifuncmain6pie.c
+++ b/elf/ifuncmain6pie.c
@@ -9,7 +9,6 @@
#include "ifunc-sel.h"
typedef int (*foo_p) (void);
-extern foo_p foo_ptr;
static int
one (void)
@@ -28,20 +27,17 @@ foo_ifunc (void)
}
extern int foo (void);
-extern foo_p get_foo (void);
+extern int call_foo (void);
extern foo_p get_foo_p (void);
-foo_p my_foo_ptr = foo;
+foo_p foo_ptr = foo;
int
main (void)
{
foo_p p;
- p = get_foo ();
- if (p != foo)
- abort ();
- if ((*p) () != -30)
+ if (call_foo () != -30)
abort ();
p = get_foo_p ();
@@ -52,12 +48,8 @@ main (void)
if (foo_ptr != foo)
abort ();
- if (my_foo_ptr != foo)
- abort ();
if ((*foo_ptr) () != -30)
abort ();
- if ((*my_foo_ptr) () != -30)
- abort ();
if (foo () != -30)
abort ();
diff --git a/elf/ifuncmod6.c b/elf/ifuncmod6.c
index 2e16c1d06d..2f6d0715e6 100644
--- a/elf/ifuncmod6.c
+++ b/elf/ifuncmod6.c
@@ -4,7 +4,7 @@ extern int foo (void);
typedef int (*foo_p) (void);
-foo_p foo_ptr = foo;
+extern foo_p foo_ptr;
foo_p
get_foo_p (void)
@@ -12,8 +12,8 @@ get_foo_p (void)
return foo_ptr;
}
-foo_p
-get_foo (void)
+int
+call_foo (void)
{
- return foo;
+ return foo ();
}
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index 06e161d9b8..36dd5d12c3 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -73,7 +73,7 @@ modules.so := $(addsuffix .so, $(modules))
ifeq (yes,$(build-shared))
tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
- bug-iconv10 bug-iconv11 bug-iconv12
+ bug-iconv10 bug-iconv11 bug-iconv12 bug-iconv14
ifeq ($(have-thread-library),yes)
tests += bug-iconv3
endif
@@ -316,6 +316,8 @@ $(objpfx)bug-iconv10.out: $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so))
$(objpfx)bug-iconv12.out: $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so))
+$(objpfx)bug-iconv14.out: $(objpfx)gconv-modules \
+ $(addprefix $(objpfx),$(modules.so))
$(objpfx)iconv-test.out: run-iconv-test.sh $(objpfx)gconv-modules \
$(addprefix $(objpfx),$(modules.so)) \
diff --git a/iconvdata/bug-iconv14.c b/iconvdata/bug-iconv14.c
new file mode 100644
index 0000000000..902f140fa9
--- /dev/null
+++ b/iconvdata/bug-iconv14.c
@@ -0,0 +1,127 @@
+/* Assertion in ISO-2022-JP-3 due to two-character sequence (bug 27256).
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <iconv.h>
+#include <string.h>
+#include <errno.h>
+#include <support/check.h>
+
+/* Use an escape sequence to return to the initial state. */
+static void
+with_escape_sequence (void)
+{
+ iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3");
+ TEST_VERIFY_EXIT (c != (iconv_t) -1);
+
+ char in[] = "\e$(O+D\e(B";
+ char *inbuf = in;
+ size_t inleft = strlen (in);
+ char out[3]; /* Space for one output character. */
+ char *outbuf;
+ size_t outleft;
+
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+ TEST_COMPARE (inleft, 3);
+ TEST_COMPARE (inbuf - in, strlen (in) - 3);
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xc3);
+ TEST_COMPARE (out[1] & 0xff, 0xa6);
+
+ /* Return to the initial shift state, producing the pending
+ character. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), 0);
+ TEST_COMPARE (inleft, 0);
+ TEST_COMPARE (inbuf - in, strlen (in));
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ /* Nothing should be flushed the second time. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out));
+ TEST_COMPARE (outbuf - out, 0);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ TEST_COMPARE (iconv_close (c), 0);
+}
+
+/* Use an explicit flush to return to the initial state. */
+static void
+with_flush (void)
+{
+ iconv_t c = iconv_open ("UTF-8", "ISO-2022-JP-3");
+ TEST_VERIFY_EXIT (c != (iconv_t) -1);
+
+ char in[] = "\e$(O+D";
+ char *inbuf = in;
+ size_t inleft = strlen (in);
+ char out[3]; /* Space for one output character. */
+ char *outbuf;
+ size_t outleft;
+
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, &inbuf, &inleft, &outbuf, &outleft), (size_t) -1);
+ TEST_COMPARE (errno, E2BIG);
+ TEST_COMPARE (inleft, 0);
+ TEST_COMPARE (inbuf - in, strlen (in));
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xc3);
+ TEST_COMPARE (out[1] & 0xff, 0xa6);
+
+ /* Flush the pending character. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out) - 2);
+ TEST_COMPARE (outbuf - out, 2);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ /* Nothing should be flushed the second time. */
+ outbuf = out;
+ outleft = sizeof (out);
+ TEST_COMPARE (iconv (c, NULL, 0, &outbuf, &outleft), 0);
+ TEST_COMPARE (outleft, sizeof (out));
+ TEST_COMPARE (outbuf - out, 0);
+ TEST_COMPARE (out[0] & 0xff, 0xcc);
+ TEST_COMPARE (out[1] & 0xff, 0x80);
+
+ TEST_COMPARE (iconv_close (c), 0);
+}
+
+static int
+do_test (void)
+{
+ with_escape_sequence ();
+ with_flush ();
+ return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/iconvdata/iso-2022-jp-3.c b/iconvdata/iso-2022-jp-3.c
index de259580c3..047fab8e8d 100644
--- a/iconvdata/iso-2022-jp-3.c
+++ b/iconvdata/iso-2022-jp-3.c
@@ -67,23 +67,34 @@ enum
CURRENT_SEL_MASK = 7 << 3
};
-/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the state
- also contains the last two bytes to be output, shifted by 6 bits, and a
- one-bit indicator whether they must be preceded by the shift sequence,
- in bit 22. */
+/* During UCS-4 to ISO-2022-JP-3 conversion, the COUNT element of the
+ state also contains the last two bytes to be output, shifted by 6
+ bits, and a one-bit indicator whether they must be preceded by the
+ shift sequence, in bit 22. During ISO-2022-JP-3 to UCS-4
+ conversion, COUNT may also contain a non-zero pending wide
+ character, shifted by six bits. This happens for certain inputs in
+ JISX0213_1_2004_set and JISX0213_2_set if the second wide character
+ in a combining sequence cannot be written because the buffer is
+ full. */
/* Since this is a stateful encoding we have to provide code which resets
the output state to the initial state. This has to be done during the
flushing. */
#define EMIT_SHIFT_TO_INIT \
- if ((data->__statep->__count & ~7) != ASCII_set) \
+ if (data->__statep->__count != ASCII_set) \
{ \
if (FROM_DIRECTION) \
{ \
- /* It's easy, we don't have to emit anything, we just reset the \
- state for the input. */ \
- data->__statep->__count &= 7; \
- data->__statep->__count |= ASCII_set; \
+ if (__glibc_likely (outbuf + 4 <= outend)) \
+ { \
+ /* Write out the last character. */ \
+ *((uint32_t *) outbuf) = data->__statep->__count >> 6; \
+ outbuf += sizeof (uint32_t); \
+ data->__statep->__count = ASCII_set; \
+ } \
+ else \
+ /* We don't have enough room in the output buffer. */ \
+ status = __GCONV_FULL_OUTPUT; \
} \
else \
{ \
@@ -151,7 +162,21 @@ enum
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
- uint32_t ch = *inptr; \
+ uint32_t ch; \
+ \
+ /* Output any pending character. */ \
+ ch = set >> 6; \
+ if (__glibc_unlikely (ch != 0)) \
+ { \
+ put32 (outptr, ch); \
+ outptr += 4; \
+ /* Remove the pending character, but preserve state bits. */ \
+ set &= (1 << 6) - 1; \
+ continue; \
+ } \
+ \
+ /* Otherwise read the next input byte. */ \
+ ch = *inptr; \
\
/* Recognize escape sequences. */ \
if (__glibc_unlikely (ch == ESC)) \
@@ -297,21 +322,25 @@ enum
uint32_t u1 = __jisx0213_to_ucs_combining[ch - 1][0]; \
uint32_t u2 = __jisx0213_to_ucs_combining[ch - 1][1]; \
\
+ inptr += 2; \
+ \
+ put32 (outptr, u1); \
+ outptr += 4; \
+ \
/* See whether we have room for two characters. */ \
- if (outptr + 8 <= outend) \
+ if (outptr + 4 <= outend) \
{ \
- inptr += 2; \
- put32 (outptr, u1); \
- outptr += 4; \
put32 (outptr, u2); \
outptr += 4; \
continue; \
} \
- else \
- { \
- result = __GCONV_FULL_OUTPUT; \
- break; \
- } \
+ \
+ /* Otherwise store only the first character now, and \
+ put the second one into the queue. */ \
+ set |= u2 << 6; \
+ /* Tell the caller why we terminate the loop. */ \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
} \
\
inptr += 2; \
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 9617cb754f..c7ae423417 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -388,13 +388,6 @@ elf_machine_lazy_rel (struct link_map *map,
/* Check for unexpected PLT reloc type. */
if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
{
- if (map->l_mach.plt == 0)
- {
- /* Prelinking. */
- *reloc_addr += l_addr;
- return;
- }
-
if (1) /* DT_AARCH64_VARIANT_PCS is not available, so always check. */
{
/* Check the symbol table for variant PCS symbols. */
@@ -418,7 +411,10 @@ elf_machine_lazy_rel (struct link_map *map,
}
}
- *reloc_addr = map->l_mach.plt;
+ if (map->l_mach.plt == 0)
+ *reloc_addr += l_addr;
+ else
+ *reloc_addr = map->l_mach.plt;
}
else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1))
{
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 7e1163e6a0..919b28d7e1 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,32 +33,24 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
#define G_l count
#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- In order to share code with memmove, small and medium copies read all
- data before writing, allowing any kind of overlap. So small, medium
- and large backwards memmoves are handled by falling through into memcpy.
- Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -66,108 +58,115 @@
# define MEMCPY memcpy
#endif
-ENTRY_ALIGN (MEMMOVE, 6)
-
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
- sub tmp1, dstin, src
- cmp count, 96
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
- /* Common case falls through into memcpy. */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-ENTRY (MEMCPY)
+ Large copies use a software pipelined loop processing 64 bytes per
+ iteration. The destination pointer is 16-byte aligned to minimize
+ unaligned accesses. The loop tail is handled by always copying 64 bytes
+ from the end.
+*/
+ENTRY_ALIGN (MEMCPY, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
- prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
+ cmp count, 128
b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
.p2align 4
+ /* Copy more than 128 bytes. */
L(copy_long):
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
- ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
@@ -176,7 +175,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
+ b.ls L(copy64_from_end)
+
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
@@ -189,10 +189,8 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
@@ -207,20 +205,42 @@ L(last64):
stp C_l, C_h, [dstend, -16]
ret
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
add srcend, src, count
add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ .p2align 4
+L(move_long):
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.hs L(copy_long)
- and tmp1, dstend, 15
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
@@ -230,10 +250,9 @@ L(move_long):
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
- b.ls 2f
+ b.ls L(copy64_from_start)
- nop
-1:
+L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
@@ -243,12 +262,10 @@ L(move_long):
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
- b.hi 1b
+ b.hi L(loop64_backwards)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
@@ -261,7 +278,7 @@ L(move_long):
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
-3: ret
+ ret
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf7238..134fe463fd 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor memset_generic memset_falkor
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103..0ccd14118c 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,10 +42,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 8f5d4e7df5..e69a1ae5af 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,14 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
libc_ifunc (__libc_memcpy,
(IS_THUNDERX (midr)
? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr)
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memcpy_falkor
: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
? __memcpy_thunderx2
- : __memcpy_generic))));
+ : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+ || IS_NEOVERSE_V1 (midr)
+ ? __memcpy_simd
+ : __memcpy_generic)))));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000000..48bb6d7ca4
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,248 @@
+/* Generic optimized memcpy using SIMD.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_lw w10
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per
+ iteration. The destination pointer is 16-byte aligned to minimize
+ unaligned accesses. The loop tail is handled by always copying 64 bytes
+ from the end. */
+
+ENTRY (__memcpy_simd)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Align loop64 below to 16 bytes. */
+ nop
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small moves: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
+ ret
+
+L(move_long):
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(move0)
+ cmp tmp1, count
+ b.hs L(copy_long)
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(move0):
+ ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d816291..b426dad834 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@
extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -37,7 +38,10 @@ libc_ifunc (__libc_memmove,
? __memmove_thunderx
: (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memmove_falkor
- : __memmove_generic)));
+ : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+ || IS_NEOVERSE_V1 (midr)
+ ? __memmove_simd
+ : __memmove_generic))));
# undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index 5b30709436..509e3e14c5 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -45,7 +45,7 @@
#define ENTRY(name) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),%function; \
- .align 4; \
+ .p2align 6; \
C_LABEL(name) \
cfi_startproc; \
CALL_MCOUNT
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index f6cfb90e21..1e3ef25498 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
{
# ifndef RTLD_BOOTSTRAP
if (sym_map != map
- && sym_map->l_type != lt_executable
&& !sym_map->l_relocated)
{
const char *strtab
= (const char *) D_PTR (map, l_info[DT_STRTAB]);
- _dl_error_printf ("\
+ if (sym_map->l_type == lt_executable)
+ _dl_fatal_printf ("\
+%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+and creates an unsatisfiable circular dependency.\n",
+ RTLD_PROGNAME, strtab + refsym->st_name,
+ map->l_name);
+ else
+ _dl_error_printf ("\
%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
- RTLD_PROGNAME, map->l_name,
- sym_map->l_name,
- strtab + refsym->st_name);
+ RTLD_PROGNAME, map->l_name,
+ sym_map->l_name,
+ strtab + refsym->st_name);
}
# endif
value = ((Elf32_Addr (*) (void)) value) ();
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 153d258afe..fbe1148652 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -51,8 +51,12 @@
#define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h' \
&& MIDR_PARTNUM(midr) == 0x000)
-#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
- && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd0c)
+#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd49)
+#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A' \
+ && MIDR_PARTNUM(midr) == 0xd40)
struct cpu_features
{
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 43ad4a79ff..891a4fdbb0 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -12,6 +12,12 @@ endif
ifeq ($(subdir),setjmp)
gen-as-const-headers += jmp_buf-ssp.sym
sysdep_routines += __longjmp_cancel
+ifneq ($(enable-cet),no)
+ifneq ($(have-tunables),no)
+tests += tst-setjmp-cet
+tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
+endif
+endif
endif
ifeq ($(enable-cet),yes)
diff --git a/sysdeps/x86/dl-cet.c b/sysdeps/x86/dl-cet.c
index b82ba14e75..56c37bcd4f 100644
--- a/sysdeps/x86/dl-cet.c
+++ b/sysdeps/x86/dl-cet.c
@@ -105,7 +105,11 @@ dl_cet_check (struct link_map *m, const char *program)
/* No legacy object check if both IBT and SHSTK are always on. */
if (enable_ibt_type == CET_ALWAYS_ON
&& enable_shstk_type == CET_ALWAYS_ON)
- return;
+ {
+ THREAD_SETMEM (THREAD_SELF, header.feature_1,
+ GL(dl_x86_feature_1)[0]);
+ return;
+ }
/* Check if IBT is enabled by kernel. */
bool ibt_enabled
diff --git a/sysdeps/x86/tst-setjmp-cet.c b/sysdeps/x86/tst-setjmp-cet.c
new file mode 100644
index 0000000000..42c795d2a8
--- /dev/null
+++ b/sysdeps/x86/tst-setjmp-cet.c
@@ -0,0 +1 @@
+#include <setjmp/tst-setjmp.c>
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 1942ed5061..c5fdcd3d39 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -315,16 +315,22 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
{
# ifndef RTLD_BOOTSTRAP
if (sym_map != map
- && sym_map->l_type != lt_executable
&& !sym_map->l_relocated)
{
const char *strtab
= (const char *) D_PTR (map, l_info[DT_STRTAB]);
- _dl_error_printf ("\
+ if (sym_map->l_type == lt_executable)
+ _dl_fatal_printf ("\
+%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+and creates an unsatisfiable circular dependency.\n",
+ RTLD_PROGNAME, strtab + refsym->st_name,
+ map->l_name);
+ else
+ _dl_error_printf ("\
%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
- RTLD_PROGNAME, map->l_name,
- sym_map->l_name,
- strtab + refsym->st_name);
+ RTLD_PROGNAME, map->l_name,
+ sym_map->l_name,
+ strtab + refsym->st_name);
}
# endif
value = ((ElfW(Addr) (*) (void)) value) ();
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 9bab1147d5..5aaadc233f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -67,6 +67,13 @@
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
+/* Avoid short distance rep movsb only with non-SSE vector. */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -257,7 +264,21 @@ L(movsb):
# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
jb L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ jmp 2f
+# endif
1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rsi, %rcx
+ subq %rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+ is N*4GB + [1..63] with N >= 0. */
+ cmpl $63, %ecx
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
+# endif
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):