diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2015-07-29 03:44:39 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2015-08-03 14:32:48 -0700 |
commit | 1cb03e05847b1e8e68a06473f0fedf79ecf49005 (patch) | |
tree | ecac5f808ec2883786922a3161b9467e59e970fa /sysdeps/x86_64 | |
parent | 0b92f51d8303a5148aa99dc101d1e73244199a61 (diff) | |
download | glibc-1cb03e05847b1e8e68a06473f0fedf79ecf49005.tar glibc-1cb03e05847b1e8e68a06473f0fedf79ecf49005.tar.gz glibc-1cb03e05847b1e8e68a06473f0fedf79ecf49005.tar.bz2 glibc-1cb03e05847b1e8e68a06473f0fedf79ecf49005.zip |
Replace %xmm[8-12] with %xmm[0-4]
Since ld.so now preserves vector registers, we can use %xmm[0-4] instead of
%xmm[8-12] and so avoid the REX prefix that the higher-numbered registers require.
* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/strlen.S | 94 |
1 file changed, 47 insertions, 47 deletions
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index c382c8d23e..07253330cf 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -20,7 +20,7 @@ /* Long lived register in strlen(s), strnlen(s, n) are: - %xmm11 - zero + %xmm3 - zero %rdi - s %r10 (s+n) & (~(64-1)) %r11 s+n @@ -32,14 +32,14 @@ ENTRY(strlen) /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ #define FIND_ZERO \ - pcmpeqb (%rax), %xmm8; \ - pcmpeqb 16(%rax), %xmm9; \ - pcmpeqb 32(%rax), %xmm10; \ - pcmpeqb 48(%rax), %xmm11; \ - pmovmskb %xmm8, %esi; \ - pmovmskb %xmm9, %edx; \ - pmovmskb %xmm10, %r8d; \ - pmovmskb %xmm11, %ecx; \ + pcmpeqb (%rax), %xmm0; \ + pcmpeqb 16(%rax), %xmm1; \ + pcmpeqb 32(%rax), %xmm2; \ + pcmpeqb 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ salq $16, %rdx; \ salq $16, %rcx; \ orq %rsi, %rdx; \ @@ -63,10 +63,10 @@ L(n_nonzero): mov %rsi, %r11 #endif - pxor %xmm8, %xmm8 - pxor %xmm9, %xmm9 - pxor %xmm10, %xmm10 - pxor %xmm11, %xmm11 + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 movq %rdi, %rax movq %rdi, %rcx andq $4095, %rcx @@ -103,9 +103,9 @@ L(n_nonzero): FIND_ZERO #else /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm12 - pcmpeqb %xmm8, %xmm12 - pmovmskb %xmm12, %edx + movdqu (%rax), %xmm4 + pcmpeqb %xmm0, %xmm4 + pmovmskb %xmm4, %edx test %edx, %edx je L(next48_bytes) bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ @@ -114,12 +114,12 @@ L(n_nonzero): L(next48_bytes): /* Same as FIND_ZERO except we do not check first 16 bytes. 
*/ andq $-16, %rax - pcmpeqb 16(%rax), %xmm9 - pcmpeqb 32(%rax), %xmm10 - pcmpeqb 48(%rax), %xmm11 - pmovmskb %xmm9, %edx - pmovmskb %xmm10, %r8d - pmovmskb %xmm11, %ecx + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx salq $16, %rdx salq $16, %rcx orq %r8, %rcx @@ -127,7 +127,7 @@ L(next48_bytes): orq %rcx, %rdx #endif - /* When no zero byte is found xmm9-11 are zero so we do not have to + /* When no zero byte is found xmm1-3 are zero so we do not have to zero them. */ PROLOG(loop) @@ -149,9 +149,9 @@ L(strnlen_ret): #endif .p2align 4 L(loop_init): - pxor %xmm9, %xmm9 - pxor %xmm10, %xmm10 - pxor %xmm11, %xmm11 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 #ifdef AS_STRNLEN .p2align 4 L(loop): @@ -160,12 +160,12 @@ L(loop): cmpq %rax, %r10 je L(exit_end) - movdqa (%rax), %xmm8 - pminub 16(%rax), %xmm8 - pminub 32(%rax), %xmm8 - pminub 48(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx + movdqa (%rax), %xmm0 + pminub 16(%rax), %xmm0 + pminub 32(%rax), %xmm0 + pminub 48(%rax), %xmm0 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit) jmp L(loop) @@ -174,7 +174,7 @@ L(loop): L(exit_end): cmp %rax, %r11 je L(first) /* Do not read when end is at page boundary. 
*/ - pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 FIND_ZERO L(first): @@ -186,7 +186,7 @@ L(first): .p2align 4 L(exit): - pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 FIND_ZERO bsfq %rdx, %rdx @@ -200,23 +200,23 @@ L(exit): .p2align 4 L(loop): - movdqa 64(%rax), %xmm8 - pminub 80(%rax), %xmm8 - pminub 96(%rax), %xmm8 - pminub 112(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx + movdqa 64(%rax), %xmm0 + pminub 80(%rax), %xmm0 + pminub 96(%rax), %xmm0 + pminub 112(%rax), %xmm0 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit64) subq $-128, %rax - movdqa (%rax), %xmm8 - pminub 16(%rax), %xmm8 - pminub 32(%rax), %xmm8 - pminub 48(%rax), %xmm8 - pcmpeqb %xmm11, %xmm8 - pmovmskb %xmm8, %edx + movdqa (%rax), %xmm0 + pminub 16(%rax), %xmm0 + pminub 32(%rax), %xmm0 + pminub 48(%rax), %xmm0 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx testl %edx, %edx jne L(exit0) jmp L(loop) @@ -225,7 +225,7 @@ L(loop): L(exit64): addq $64, %rax L(exit0): - pxor %xmm8, %xmm8 + pxor %xmm0, %xmm0 FIND_ZERO bsfq %rdx, %rdx |