diff options
Diffstat (limited to 'sysdeps/x86_64/multiarch/wcscpy-ssse3.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 64 |
1 files changed, 26 insertions, 38 deletions
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 4e292f3c2b..477b2cb4ef 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -21,8 +21,9 @@ #ifndef NOT_IN_libc # include <sysdep.h> -.text + .section .text.ssse3,"ax",@progbits ENTRY (__wcscpy_ssse3) + mov %rsi, %rcx mov %rdi, %rdx @@ -136,6 +137,7 @@ L(Align16Both): mov $-0x40, %rsi + .p2align 4 L(Aligned64Loop): movaps (%rcx), %xmm2 movaps %xmm2, %xmm4 @@ -205,7 +207,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -213,15 +214,14 @@ L(Shl4Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -233,7 +233,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 28(%rcx), %xmm2 @@ -245,8 +244,7 @@ L(Shl4Start): test %rax, %rax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 28(%rcx), %rcx lea 16(%rdx), %rdx @@ -259,6 +257,7 @@ L(Shl4Start): movaps -4(%rcx), %xmm1 + .p2align 4 L(Shl4LoopStart): movaps 12(%rcx), %xmm2 movaps 28(%rcx), %xmm3 @@ -289,11 +288,9 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%rdx), %xmm6 - psrldq $12, %xmm6 + movdqu -4(%rcx), %xmm1 mov $12, %rsi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + movdqu %xmm1, -4(%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -309,7 +306,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -317,15 +313,14 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -337,7 +332,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 24(%rcx), %xmm2 @@ -345,13 +339,11 @@ L(Shl8Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 24(%rcx), %rcx lea 16(%rdx), %rdx @@ -364,6 +356,7 @@ L(Shl8Start): movaps -8(%rcx), %xmm1 + .p2align 4 L(Shl8LoopStart): movaps 8(%rcx), %xmm2 movaps 24(%rcx), %xmm3 @@ -394,11 +387,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%rdx), %xmm6 - psrldq $8, %xmm6 + mov (%rcx), %r9 mov $8, %rsi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9, (%rdx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -414,7 +405,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -422,15 +412,14 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 + movaps %xmm2, %xmm1 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 - movaps %xmm3, %xmm1 pcmpeqd %xmm2, %xmm0 lea 16(%rdx), %rdx @@ -442,7 +431,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%rdx) movaps 20(%rcx), %xmm2 @@ -450,13 +438,11 @@ L(Shl12Start): lea 16(%rdx), %rdx pmovmskb %xmm0, %rax lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 test %rax, %rax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) lea 20(%rcx), %rcx lea 16(%rdx), %rdx @@ -469,6 +455,7 @@ L(Shl12Start): movaps -12(%rcx), %xmm1 + .p2align 4 L(Shl12LoopStart): movaps 4(%rcx), %xmm2 movaps 20(%rcx), %xmm3 @@ -498,11 +485,10 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%rdx), %xmm6 - psrldq $4, %xmm6 + mov (%rcx), %r9d mov $4, %rsi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%rdx) + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) .p2align 4 L(CopyFrom1To16Bytes): @@ -556,8 +542,10 @@ L(Exit12): .p2align 4 L(Exit16): - movdqu (%rcx), %xmm0 - movdqu %xmm0, (%rdx) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) mov %rdi, %rax ret |