about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 9
-rw-r--r-- sysdeps/i386/i486/bits/string.h 189
2 files changed, 170 insertions, 28 deletions
diff --git a/ChangeLog b/ChangeLog
index 629c97134a..42ce9de41d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+1999-04-30 Ulrich Drepper <drepper@cygnus.com>
+
+ * string/bits/string2.h: Fix bugs I introduced in last change.
+
+ * sysdeps/i386/i486/bits/string.h (memcmp): Don't introduce partial
+ register stall.
+ Extend memset optimization.
+ Correct i686 version of memchr.
+
1999-04-30 Roland McGrath <roland@baalperazim.frob.com>
* sysdeps/i386/bzero.c: Add #undef __bzero.
diff --git a/sysdeps/i386/i486/bits/string.h b/sysdeps/i386/i486/bits/string.h
index cb0229b5cd..2aa0b509b8 100644
--- a/sysdeps/i386/i486/bits/string.h
+++ b/sysdeps/i386/i486/bits/string.h
@@ -176,7 +176,7 @@ memcmp (__const void *__s1, __const void *__s2, size_t __n)
"repe; cmpsb\n\t"
"je 1f\n\t"
"sbbl %0,%0\n\t"
- "orb $1,%b0\n"
+ "orl $1,%0\n"
"1:"
: "=a" (__res), "=&S" (__d0), "=&D" (__d1), "=&c" (__d2)
: "0" (0), "1" (__s1), "2" (__s2), "3" (__n)
@@ -189,24 +189,157 @@ memcmp (__const void *__s1, __const void *__s2, size_t __n)
/* Set N bytes of S to C. */
#define _HAVE_STRING_ARCH_memset 1
#define memset(s, c, n) \
- (__extension__ (__builtin_constant_p (c) \
- ? memset (s, c, n) \
- : (__builtin_constant_p (n) \
- ? __memset_gc (s, c, n) \
- : __memset_gg (s, c, n))))
+ (__extension__ (__builtin_constant_p (n) && (n) <= 16 \
+ ? (__builtin_constant_p (c) \
+ ? __memset_gc (s, ((unsigned char) (c)) * 0x01010101, n) \
+ : ((n) == 1 \
+ ? __memset_c1 (s, c) \
+ : __memset_gc (s, c, n))) \
+ : (__builtin_constant_p (c) \
+ ? (__builtin_constant_p (n) \
+ ? __memset_ccn (s, c, n) \
+ : __memset_gg (s, c, n)) \
+ : (__builtin_constant_p (n) \
+ ? __memset_gcn (s, c, n) \
+ : __memset_gg (s, c, n)))))
+
+#define __memset_c1(s, c) ({ void *__s = (s); \
+ *((__uint8_t *) __s) = (__uint8_t) (c); __s; })
+
#define __memset_gc(s, c, n) \
- ((n) == 0 \
- ? (s) \
- : (((n) % 4== 0) \
- ? __memset_gc_by4 (s, c, n) \
- : (((n) % 2 == 0) \
- ? __memset_gc_by2 (s, c, n) \
- : __memset_gg (s, c, n))))
+ ({ void *__s = (s); \
+ __uint32_t *__ts = (__uint32_t *) __s; \
+ __uint8_t __c = (__uint8_t) (c); \
+ \
+ /* We apply a trick here. `gcc' would implement the following \
+ assignments using absolute operands. But this uses too much \
+ memory (7, instead of 4 bytes). */ \
+ if (n >= 5) \
+ __asm__ __volatile__ ("" : "=r" (__c) : "0" (__c)); \
+ \
+ /* This `switch' statement will be removed at compile-time. */ \
+ switch (n) \
+ { \
+ case 15: \
+ *__ts++ = __c * 0x01010101; \
+ case 11: \
+ *__ts++ = __c * 0x01010101; \
+ case 7: \
+ *__ts++ = __c * 0x01010101; \
+ case 3: \
+ *((__uint16_t *) __ts)++ = __c * 0x0101; \
+ *((__uint8_t *) __ts) = __c; \
+ break; \
+ \
+ case 14: \
+ *__ts++ = __c * 0x01010101; \
+ case 10: \
+ *__ts++ = __c * 0x01010101; \
+ case 6: \
+ *__ts++ = __c * 0x01010101; \
+ case 2: \
+ *((__uint16_t *) __ts) = __c * 0x0101; \
+ break; \
+ \
+ case 13: \
+ *__ts++ = __c * 0x01010101; \
+ case 9: \
+ *__ts++ = __c * 0x01010101; \
+ case 5: \
+ *__ts++ = __c * 0x01010101; \
+ case 1: \
+ *((__uint8_t *) __ts) = __c; \
+ break; \
+ \
+ case 16: \
+ *__ts++ = __c * 0x01010101; \
+ case 12: \
+ *__ts++ = __c * 0x01010101; \
+ case 8: \
+ *__ts++ = __c * 0x01010101; \
+ case 4: \
+ *__ts = __c * 0x01010101; \
+ case 0: \
+ break; \
+ } \
+ \
+ __s; })
+
+#define __memset_ccn(s, c, n) \
+ (((n) % 4 == 0) \
+ ? __memset_ccn_by4 (s, ((__uint8_t) (c)) * 0x01010101, n) \
+ : (((n) % 2 == 0) \
+ ? __memset_ccn_by2 (s, ((__uint8_t) (c)) * 0x01010101, n) \
+ : __memset_gg (s, c, n)))
+
+__STRING_INLINE void *__memset_ccn_by4 (void *__s, int __c, size_t __n);
-__STRING_INLINE void *__memset_gc_by4 (void *__s, int __c, size_t __n);
+__STRING_INLINE void *
+__memset_ccn_by4 (void *__s, int __c, size_t __n)
+{
+ register void *__tmp = __s;
+ register unsigned long int __d0;
+#ifdef __i686__
+ __asm__ __volatile__
+ ("cld\n\t"
+ "rep; stosl"
+ : "=&a" (__c), "=&D" (__tmp), "=&c" (__d0)
+ : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 4)
+ : "memory", "cc");
+#else
+ __asm__ __volatile__
+ ("1:\n\t"
+ "movl %0,(%1)\n\t"
+ "addl $4,%1\n\t"
+ "decl %2\n\t"
+ "jnz 1b\n"
+ : "=&q" (__c), "=&r" (__tmp), "=&r" (__d0)
+ : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 4)
+ : "memory", "cc");
+#endif
+ return __s;
+}
+
+__STRING_INLINE void *__memset_ccn_by2 (void *__s, int __c, size_t __n);
__STRING_INLINE void *
-__memset_gc_by4 (void *__s, int __c, size_t __n)
+__memset_ccn_by2 (void *__s, int __c, size_t __n)
+{
+ register unsigned long int __d0, __d1;
+ register void *__tmp = __s;
+#ifdef __i686__
+ __asm__ __volatile__
+ ("cld\n\t"
+ "rep; stosl\n"
+ "stosw"
+ : "=&a" (__d0), "=&D" (__tmp), "=&c" (__d1)
+ : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 4)
+ : "memory", "cc");
+#else
+ __asm__ __volatile__
+ ("1:\tmovl %0,(%1)\n\t"
+ "leal 4(%1),%1\n\t"
+ "decl %2\n\t"
+ "jnz 1b\n"
+ "movw %w0,(%1)"
+ : "=&q" (__d0), "=&r" (__tmp), "=&r" (__d1)
+ : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 4)
+ : "memory", "cc");
+#endif
+ return __s;
+}
+
+#define __memset_gcn(s, c, n) \
+ (((n) % 4 == 0) \
+ ? __memset_gcn_by4 (s, c, n) \
+ : (((n) % 2 == 0) \
+ ? __memset_gcn_by2 (s, c, n) \
+ : __memset_gg (s, c, n)))
+
+__STRING_INLINE void *__memset_gcn_by4 (void *__s, int __c, size_t __n);
+
+__STRING_INLINE void *
+__memset_gcn_by4 (void *__s, int __c, size_t __n)
{
register void *__tmp = __s;
register unsigned long int __d0;
@@ -226,17 +359,15 @@ __memset_gc_by4 (void *__s, int __c, size_t __n)
return __s;
}
-__STRING_INLINE void *__memset_gc_by2 (void *__s, int __c, size_t __n);
+__STRING_INLINE void *__memset_gcn_by2 (void *__s, int __c, size_t __n);
__STRING_INLINE void *
-__memset_gc_by2 (void *__s, int __c, size_t __n)
+__memset_gcn_by2 (void *__s, int __c, size_t __n)
{
register unsigned long int __d0, __d1;
register void *__tmp = __s;
__asm__ __volatile__
("movb %b0,%h0\n\t"
- "shrl $1,%2\n\t" /* may be divisible also by 4 */
- "jz 2f\n\t"
"pushw %w0\n\t"
"shll $16,%0\n\t"
"popw %w0\n"
@@ -245,10 +376,9 @@ __memset_gc_by2 (void *__s, int __c, size_t __n)
"leal 4(%1),%1\n\t"
"decl %2\n\t"
"jnz 1b\n"
- "2:\n\t"
"movw %w0,(%1)"
: "=&q" (__d0), "=&r" (__tmp), "=&r" (__d1)
- : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 2)
+ : "0" ((unsigned int) __c), "1" (__tmp), "2" (__n / 4)
: "memory", "cc");
return __s;
}
@@ -261,7 +391,8 @@ __memset_gg (void *__s, int __c, size_t __n)
register unsigned long int __d0, __d1;
register void *__tmp = __s;
__asm__ __volatile__
- ("movb %%al,%%ah\n\t"
+ ("cld\n\t"
+ "movb %%al,%%ah\n\t"
"shrl $1,%%ecx\n\t"
"rep; stosw\n\t"
"jnc 1f\n\t"
@@ -280,18 +411,20 @@ __STRING_INLINE void *
memchr (__const void *__s, int __c, size_t __n)
{
register unsigned long int __d0;
+#ifdef __i686__
+ register unsigned long int __d1;
+#endif
register unsigned char *__res;
if (__n == 0)
return NULL;
#ifdef __i686__
__asm__ __volatile__
- ("movl $1, %%edx\n\t"
- "cld\n\t"
+ ("cld\n\t"
"repne; scasb\n\t"
- "cmovne %%edx,%0"
- : "=D" (__res), "=&c" (__d0)
- : "a" (__c), "0" (__s), "1" (__n)
- : "dx", "cc");
+ "cmovne %2,%0"
+ : "=D" (__res), "=&c" (__d0), "=&r" (__d1)
+ : "a" (__c), "0" (__s), "1" (__n), "2" (1)
+ : "cc");
#else
__asm__ __volatile__
("cld\n\t"