aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/memchr.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/x86_64/memchr.S')
-rw-r--r--sysdeps/x86_64/memchr.S78
1 files changed, 68 insertions, 10 deletions
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 891ee70aef..205345b43d 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -20,8 +20,17 @@
/* fast SSE2 version with using pmaxub and 64 byte loop */
+# ifdef __CHKP__ /* Bounds-checked build (presumably the MPX pointer checker; confirm): RETURN runs bndcu on (%rax) against %bnd0, the upper-bound check, and then returns.  */
+# define RETURN \
+ bndcu (%rax), %bnd0; \
+ ret
+# else /* !__CHKP__: RETURN is a plain ret.  */
+# define RETURN ret
+# endif /* __CHKP__ */
+
.text
ENTRY(memchr)
+
movd %rsi, %xmm1
mov %rdi, %rcx
@@ -33,6 +42,10 @@ ENTRY(memchr)
and $63, %rcx
pshufd $0, %xmm1, %xmm1
+#ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+#endif
cmp $48, %rcx
ja L(crosscache)
@@ -72,7 +85,7 @@ L(crosscache):
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
- ret
+ RETURN
.p2align 4
L(unaligned_no_match):
@@ -85,24 +98,36 @@ L(unaligned_no_match):
.p2align 4
L(loop_prolog):
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+#endif
movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
+#ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+#endif
movdqa 16(%rdi), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
+#ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+#endif
movdqa 32(%rdi), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
+#ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+#endif
movdqa 48(%rdi), %xmm4
pcmpeqb %xmm1, %xmm4
add $64, %rdi
@@ -116,24 +141,36 @@ L(loop_prolog):
sub $64, %rdx
jbe L(exit_loop)
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+#endif
movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
+#ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+#endif
movdqa 16(%rdi), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
+#ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+#endif
movdqa 32(%rdi), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
+#ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+#endif
movdqa 48(%rdi), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
@@ -151,6 +188,9 @@ L(loop_prolog):
L(align64_loop):
sub $64, %rdx
jbe L(exit_loop)
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+#endif
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
movdqa 32(%rdi), %xmm3
@@ -192,25 +232,34 @@ L(align64_loop):
pmovmskb %xmm1, %eax
bsf %eax, %eax
lea 48(%rdi, %rax), %rax
- ret
+ RETURN
.p2align 4
L(exit_loop):
add $32, %rdx
jle L(exit_loop_32)
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+#endif
movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
+#ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+#endif
movdqa 16(%rdi), %xmm2
pcmpeqb %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
+#ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+#endif
movdqa 32(%rdi), %xmm3
pcmpeqb %xmm1, %xmm3
pmovmskb %xmm3, %eax
@@ -219,6 +268,9 @@ L(exit_loop):
sub $16, %rdx
jle L(return_null)
+#ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+#endif
pcmpeqb 48(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
@@ -229,6 +281,9 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
add $32, %rdx
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+#endif
movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
@@ -237,6 +292,9 @@ L(exit_loop_32):
sub $16, %rdx
jbe L(return_null)
+#ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+#endif
pcmpeqb 16(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
@@ -248,25 +306,25 @@ L(exit_loop_32):
L(matches0):
bsf %eax, %eax
lea -16(%rax, %rdi), %rax
- ret
+ RETURN
.p2align 4
L(matches):
bsf %eax, %eax
add %rdi, %rax
- ret
+ RETURN
.p2align 4
L(matches16):
bsf %eax, %eax
lea 16(%rax, %rdi), %rax
- ret
+ RETURN
.p2align 4
L(matches32):
bsf %eax, %eax
lea 32(%rax, %rdi), %rax
- ret
+ RETURN
.p2align 4
L(matches_1):
@@ -274,7 +332,7 @@ L(matches_1):
sub %rax, %rdx
jbe L(return_null)
add %rdi, %rax
- ret
+ RETURN
.p2align 4
L(matches16_1):
@@ -282,7 +340,7 @@ L(matches16_1):
sub %rax, %rdx
jbe L(return_null)
lea 16(%rdi, %rax), %rax
- ret
+ RETURN
.p2align 4
L(matches32_1):
@@ -290,7 +348,7 @@ L(matches32_1):
sub %rax, %rdx
jbe L(return_null)
lea 32(%rdi, %rax), %rax
- ret
+ RETURN
.p2align 4
L(matches48_1):
@@ -298,7 +356,7 @@ L(matches48_1):
sub %rax, %rdx
jbe L(return_null)
lea 48(%rdi, %rax), %rax
- ret
+ RETURN
.p2align 4
L(return_null):