author     H.J. Lu <hjl.tools@gmail.com>    2016-04-24 10:53:25 -0700
committer  H.J. Lu <hjl.tools@gmail.com>    2016-04-25 08:32:09 -0700
commit     343b5e49525c4c936643418300ea16437256b1e0
tree       b4aa0c86b1b6e25b335d4e915d19f3835c7741a8
parent     8dd19b0b3ca334060eec990f0afa502700939ad3
Align to cacheline
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 93 ++++++++++++++++++++
 1 file changed, 93 insertions(+), 0 deletions(-)
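The patch makes the large-copy (non-temporal) forward and backward paths of memmove copy one, two, or three VEC_SIZE chunks first, so that the destination sits on a cache-line boundary before the 4 * VEC streaming loop starts. The following is a minimal C sketch of that idea only, not glibc code: it assumes a 64-byte cache line and 32-byte vectors, and the names copy_forward_nt, VEC_SIZE, and CACHELINE_SIZE, as well as the use of plain memcpy in place of VMOVU/VMOVA, prefetching, and non-temporal stores, are illustrative.

/* Illustrative sketch of the head-alignment idea in this patch; not
   glibc code.  VEC_SIZE, CACHELINE_SIZE and copy_forward_nt are local
   to this example.  */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VEC_SIZE       32   /* e.g. one AVX register */
#define CACHELINE_SIZE 64   /* assumed cache-line size */

static void
copy_forward_nt (char *dst, const char *src, size_t len)
{
  /* Head: like the new code before L(loop_large_forward), copy just
     enough bytes (one to three vectors when dst is already
     VEC_SIZE-aligned) so that dst lands on a cache-line boundary.
     Assumes len is large, as it is on the path this patch touches.  */
  size_t mis = (uintptr_t) dst & (CACHELINE_SIZE - 1);
  if (mis != 0)
    {
      size_t head = CACHELINE_SIZE - mis;
      memcpy (dst, src, head);
      dst += head;
      src += head;
      len -= head;
    }

  /* Body: every chunk below now starts on a cache-line boundary, which
     is what makes non-temporal stores pay off.  The real code issues
     prefetches and streaming stores here; plain memcpy keeps the
     sketch portable.  */
  while (len >= CACHELINE_SIZE)
    {
      memcpy (dst, src, CACHELINE_SIZE);
      dst += CACHELINE_SIZE;
      src += CACHELINE_SIZE;
      len -= CACHELINE_SIZE;
    }

  /* Tail.  */
  memcpy (dst, src, len);
}

int
main (void)
{
  static _Alignas (CACHELINE_SIZE) char src[4096], dst[4096];
  for (size_t i = 0; i < sizeof src; i++)
    src[i] = (char) i;
  /* Start VEC_SIZE past a cache-line boundary so the head path runs.  */
  copy_forward_nt (dst + VEC_SIZE, src + VEC_SIZE, sizeof src - VEC_SIZE);
  printf ("%s\n", memcmp (dst + VEC_SIZE, src + VEC_SIZE,
                          sizeof src - VEC_SIZE) == 0 ? "ok" : "mismatch");
  return 0;
}

The actual patch below does the same thing per direction: it computes the destination's residue within the cache line, dispatches on it when CACHELINE_SIZE is 4 * VEC_SIZE, and falls into the existing L(loop_large_forward) / L(loop_large_backward) loops once the stores are cache-line aligned.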
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 0a2bf4108f..aaee527dca 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -466,6 +466,26 @@ L(large_forward):
leaq (%rdi, %rdx), %r10
cmpq %r10, %rsi
jb L(loop_4x_vec_forward)
+# if CACHELINE_SIZE != VEC_SIZE
+ movl %edi, %r8d
+ andl $(CACHELINE_SIZE - 1), %r8d
+ je L(loop_large_forward)
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+ /* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+ 3 * VEC_SIZE. */
+ cmpl $(VEC_SIZE * 2), %r8d
+ je L(misaligned_by_2x_vec_forward)
+ jb L(misaligned_by_3x_vec_forward)
+# elif CACHELINE_SIZE != (VEC_SIZE * 2)
+# error Unsupported CACHELINE_SIZE!
+# endif
+ /* Cacheline misaligned by VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ addq $VEC_SIZE, %rsi
+ subq $VEC_SIZE, %rdx
+ VMOVA %VEC(0), (%rdi)
+ addq $VEC_SIZE, %rdi
+# endif
L(loop_large_forward):
/* Copy 4 * VEC a time forward with non-temporal stores. */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
@@ -494,6 +514,32 @@ L(loop_large_forward):
VZEROUPPER
ret
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_forward):
+ /* Cacheline misaligned by 2 * VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ addq $(VEC_SIZE * 2), %rsi
+ subq $(VEC_SIZE * 2), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ addq $(VEC_SIZE * 2), %rdi
+ jmp L(loop_large_forward)
+
+L(misaligned_by_3x_vec_forward):
+ /* Cacheline misaligned by 3 * VEC_SIZE. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ addq $(VEC_SIZE * 3), %rsi
+ subq $(VEC_SIZE * 3), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ addq $(VEC_SIZE * 3), %rdi
+ jmp L(loop_large_forward)
+# endif
+
L(large_backward):
/* Don't use non-temporal store if there is overlap between
destination and source since destination may be in cache
@@ -501,6 +547,26 @@ L(large_backward):
leaq (%rcx, %rdx), %r10
cmpq %r10, %r9
jb L(loop_4x_vec_backward)
+# if CACHELINE_SIZE != VEC_SIZE
+ movl %r9d, %r8d
+ andl $(CACHELINE_SIZE - 1), %r8d
+ je L(loop_large_backward)
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+ /* Cacheline misaligned by VEC_SIZE, 2 * VEC_SIZE, or
+ 3 * VEC_SIZE. */
+ cmpl $(VEC_SIZE * 2), %r8d
+ je L(misaligned_by_2x_vec_backward)
+ jb L(misaligned_by_3x_vec_backward)
+# elif CACHELINE_SIZE != (VEC_SIZE * 2)
+# error Unsupported CACHELINE_SIZE!
+# endif
+ /* Cacheline misaligned by VEC_SIZE. */
+ VMOVU (%rcx), %VEC(0)
+ subq $VEC_SIZE, %rcx
+ subq $VEC_SIZE, %rdx
+ VMOVA %VEC(0), (%r9)
+ subq $VEC_SIZE, %r9
+# endif
L(loop_large_backward):
/* Copy 4 * VEC a time backward with non-temporal stores. */
PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
@@ -528,6 +594,33 @@ L(loop_large_backward):
VMOVU %VEC(8), (%r11)
VZEROUPPER
ret
+
+# if CACHELINE_SIZE == (VEC_SIZE * 4)
+L(misaligned_by_2x_vec_backward):
+ /* Cacheline misaligned by 2 * VEC_SIZE. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ subq $(VEC_SIZE * 2), %rcx
+ subq $(VEC_SIZE * 2), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ subq $(VEC_SIZE * 2), %r9
+ jmp L(loop_large_backward)
+
+L(misaligned_by_3x_vec_backward):
+ /* Cacheline misaligned by 3 * VEC_SIZE. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ subq $(VEC_SIZE * 3), %rcx
+ subq $(VEC_SIZE * 3), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
+ subq $(VEC_SIZE * 3), %r9
+ jmp L(loop_large_backward)
+# endif
+
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
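As a worked check of the forward dispatch (not part of the patch), note that %rdi was already aligned to VEC_SIZE earlier in this function, so with CACHELINE_SIZE == VEC_SIZE * 4 the residue %r8d = %rdi & (CACHELINE_SIZE - 1) can only be 0, VEC_SIZE, 2 * VEC_SIZE, or 3 * VEC_SIZE, and the head copy must move CACHELINE_SIZE - %r8d bytes to reach the next cache-line boundary. A hypothetical helper, reusing the VEC_SIZE/CACHELINE_SIZE macros from the sketch above, expresses that arithmetic:

/* Hypothetical helper, not in the patch: number of VEC_SIZE vectors the
   head copy must move for a residue r8 = dst & (CACHELINE_SIZE - 1),
   assuming dst is already VEC_SIZE-aligned (so r8 is a multiple of
   VEC_SIZE).  */
static unsigned int
head_vectors (unsigned int r8)
{
  return r8 == 0 ? 0 : (CACHELINE_SIZE - r8) / VEC_SIZE;
}

/* With CACHELINE_SIZE == VEC_SIZE * 4:
     r8 == VEC_SIZE      -> 3 vectors (the jb target, which stores 3 VECs)
     r8 == 2 * VEC_SIZE  -> 2 vectors (the je target, which stores 2 VECs)
     r8 == 3 * VEC_SIZE  -> 1 vector  (the fall-through single-VEC copy)
   In every case the destination pointer advances to the next cache-line
   boundary before L(loop_large_forward) starts its 4 * VEC iterations.  */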