diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-12 12:29:02 -0700 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-13 14:55:31 -0700 |
commit | 6b9006bfb03c5975f31de286311041d3c933f5ac (patch) | |
tree | d8af4955e33ee2c23a1641e9156447d556d2ca6a /sysdeps/x86_64/multiarch/strcpy-sse2.S | |
parent | 58e6cd4bcbe9f29949f1545953a17145bf732aa0 (diff) | |
download | glibc-6b9006bfb03c5975f31de286311041d3c933f5ac.tar glibc-6b9006bfb03c5975f31de286311041d3c933f5ac.tar.gz glibc-6b9006bfb03c5975f31de286311041d3c933f5ac.tar.bz2 glibc-6b9006bfb03c5975f31de286311041d3c933f5ac.zip |
x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strcpy-sse2.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 |
1 files changed, 131 insertions, 6 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S index f37967c441..8b5db8b13d 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S @@ -17,12 +17,137 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif +#endif -# include <sysdep.h> -# define strcpy __strcpy_sse2 +#include <sysdep.h> -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcpy) -#endif + .text +ENTRY (STRCPY) + movq %rsi, %rcx /* Copy source pointer to test its alignment. */ + andl $7, %ecx /* Keep the low 3 bits: offset within an 8-byte word. */ + movq %rdi, %rdx /* Duplicate destination pointer. */ + + jz 5f /* Source already aligned => start word loop (movq leaves the andl flags intact). */ + + neg %ecx /* With the addl below: ecx = 8 - (src & 7), the bytes left until the source is 8-byte aligned. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 4f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 0b + +5: + movq $0xfefefefefefefeff,%r8 + + /* Now the source is aligned. Unfortunately we cannot force + to have both source and destination aligned, so ignore the + alignment of the destination. */ + .p2align 4 +1: + /* 1st unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). 
*/ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ -#include <sysdeps/x86_64/strcpy.S> + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 1b /* Next iteration. 
*/ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +3: + /* Note that stpcpy needs to return the address of the NUL + byte, so %rdx is not advanced past it before jumping to 4. */ + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL? */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL? */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 3b /* and look at next two bytes in %rax. */ + +4: +#ifdef USE_AS_STPCPY + movq %rdx, %rax /* Return pointer to the NUL just stored in the destination. */ +#else + movq %rdi, %rax /* Return the original destination pointer. */ +#endif + retq +END (STRCPY) |