diff options
author | Andreas Schwab <schwab@redhat.com> | 2009-07-22 11:37:18 +0200 |
---|---|---|
committer | Andreas Schwab <schwab@redhat.com> | 2009-07-22 11:37:18 +0200 |
commit | 64322469ecb5746709e560f36dbc740c1300f978 (patch) | |
tree | adb81205fb9862b78ed02770ea42e96f51e89561 | |
parent | 0457885b7efb5731e67202746d476c0d023bf43f (diff) | |
parent | ae612b04cc0716186e0d14e342bee184ba94ac1b (diff) | |
download | glibc-64322469ecb5746709e560f36dbc740c1300f978.tar glibc-64322469ecb5746709e560f36dbc740c1300f978.tar.gz glibc-64322469ecb5746709e560f36dbc740c1300f978.tar.bz2 glibc-64322469ecb5746709e560f36dbc740c1300f978.zip |
Merge commit 'origin/master' into fedora/master
-rw-r--r-- | ChangeLog | 41 | ||||
-rw-r--r-- | locale/C-ctype.c | 6 | ||||
-rw-r--r-- | locale/langinfo.h | 1 | ||||
-rw-r--r-- | locale/localeinfo.h | 4 | ||||
-rw-r--r-- | locale/programs/ld-ctype.c | 27 | ||||
-rw-r--r-- | nptl/ChangeLog | 6 | ||||
-rw-r--r-- | nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S | 8 | ||||
-rw-r--r-- | nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S | 31 | ||||
-rw-r--r-- | string/strcasestr.c | 10 | ||||
-rw-r--r-- | string/strstr.c | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcasestr-c.c | 18 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcasestr.c | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strstr-c.c | 12 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strstr.c | 464 |
15 files changed, 612 insertions, 32 deletions
@@ -1,3 +1,44 @@ +2009-07-21 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/multiarch/strstr.c: Minor cleanups. Remove + unnecesary variables. Comment fixes. + +2009-07-20 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/multiarch/strstr.c [USE_AS_STRCASESTR] (STRSTR_SSE42): + Use NONASCII_CASE information provided by the locale to determine + whether optimized string load function can be used. Minor cleanups. + +2009-07-20 H.J. Lu <hongjiu.lu@intel.com> + + * string/strcasestr.c (STRCASESTR): New macro. + (__strcasestr): Renamed to .. + (STRCASESTR): ...this. + * string/strstr.c (STRSTR): New macro. + (strstr): Renamed to .. + (STRSTR): ...this. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strstr-c strcasestr-c + (CFLAGS-strstr.c): New. + (CFLAGS-strcasestr.c): Likewise. + * sysdeps/x86_64/multiarch/strcasestr-c.c: New file. + * sysdeps/x86_64/multiarch/strcasestr.c: New file. + * sysdeps/x86_64/multiarch/strstr-c.c: New file. + * sysdeps/x86_64/multiarch/strstr.c: New file. + +2009-07-20 Ulrich Drepper <drepper@redhat.com> + + * locale/localeinfo.h (LIMAGIC): Update value for LC_CTYPE. + * locale/langinfo.h: Define _NL_CTYPE_NONASCII_CASE. + * locale/C-ctype.c (_nl_C_LC_CTYPE): Add initializer for + _NL_CTYPE_NONASCII_CASE. + * locale/programs/ld-ctype.c (locale_ctype_t): Add nonascii_case + field. + (ctype_finish): Check whether there are any 8-bit characters outside + the range ASCII has or whether the mapping isn't the same as for + ASCII (±0x20). Set nonascii_case appropriately. + (ctype_output): Add output handler for nonascii_case. + 2009-07-17 Ulrich Drepper <drepper@redhat.com> * sysdeps/generic/sysdep.h: Define cfi_personality, cfi_lsda, diff --git a/locale/C-ctype.c b/locale/C-ctype.c index 85f3d2addb..420b08a13f 100644 --- a/locale/C-ctype.c +++ b/locale/C-ctype.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc. +/* Copyright (C) 1995-2002, 2003, 2009 Free Software Foundation, Inc. 
This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@cygnus.com>, 1995. @@ -528,7 +528,7 @@ _nl_C_LC_CTYPE_width attribute_hidden = }; /* Number of fields with fixed meanings, starting at 0. */ -#define NR_FIXED 71 +#define NR_FIXED 72 /* Number of class fields, starting at CLASS_OFFSET. */ #define NR_CLASSES 12 /* Number of map fields, starting at MAP_OFFSET. */ @@ -667,6 +667,8 @@ const struct locale_data _nl_C_LC_CTYPE attribute_hidden = { .wstr = NULL }, /* _NL_CTYPE_MAP_TO_NONASCII */ { .word = 0 }, + /* _NL_CTYPE_NONASCII_CASE */ + { .word = 0 }, /* NR_CLASSES wctype_tables */ { .string = (const char *) _nl_C_LC_CTYPE_class_upper.header }, { .string = (const char *) _nl_C_LC_CTYPE_class_lower.header }, diff --git a/locale/langinfo.h b/locale/langinfo.h index 59017b31c8..c940c743aa 100644 --- a/locale/langinfo.h +++ b/locale/langinfo.h @@ -334,6 +334,7 @@ enum _NL_CTYPE_TRANSLIT_IGNORE_LEN, _NL_CTYPE_TRANSLIT_IGNORE, _NL_CTYPE_MAP_TO_NONASCII, + _NL_CTYPE_NONASCII_CASE, _NL_CTYPE_EXTRA_MAP_1, _NL_CTYPE_EXTRA_MAP_2, _NL_CTYPE_EXTRA_MAP_3, diff --git a/locale/localeinfo.h b/locale/localeinfo.h index 3661080bb2..19ea41ae6d 100644 --- a/locale/localeinfo.h +++ b/locale/localeinfo.h @@ -1,5 +1,5 @@ /* Declarations for internal libc locale interfaces - Copyright (C) 1995-2003, 2005, 2006, 2007, 2008 + Copyright (C) 1995-2003, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -35,6 +35,8 @@ #define LIMAGIC(category) \ (category == LC_COLLATE \ ? ((unsigned int) (0x20051014 ^ (category))) \ + : category == LC_CTYPE \ + ? ((unsigned int) (0x20090720 ^ (category))) \ : ((unsigned int) (0x20031115 ^ (category)))) /* Two special weight constants for the collation data. 
*/ diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c index d4474bf1a2..376a02c2f0 100644 --- a/locale/programs/ld-ctype.c +++ b/locale/programs/ld-ctype.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2006, 2007 Free Software Foundation, Inc. +/* Copyright (C) 1995-2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@gnu.org>, 1995. @@ -181,6 +181,7 @@ struct locale_ctype_t size_t default_missing_lineno; uint32_t to_nonascii; + uint32_t nonascii_case; /* The arrays for the binary representation. */ char_class_t *ctype_b; @@ -625,6 +626,27 @@ character <SP> not defined in character map"))); else ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print); + /* Check whether all single-byte characters make to their upper/lowercase + equivalent according to the ASCII rules. */ + for (cnt = 'A'; cnt <= 'Z'; ++cnt) + { + uint32_t uppval = ctype->map256_collection[0][cnt]; + uint32_t lowval = ctype->map256_collection[1][cnt]; + uint32_t lowuppval = ctype->map256_collection[0][lowval]; + uint32_t lowlowval = ctype->map256_collection[1][lowval]; + + if (uppval != cnt + || lowval != cnt + 0x20 + || lowuppval != cnt + || lowlowval != cnt + 0x20) + ctype->nonascii_case = 1; + } + for (cnt = 0; cnt < 256; ++cnt) + if (cnt < 'A' || (cnt > 'Z' && cnt < 'a') || cnt > 'z') + if (ctype->map256_collection[0][cnt] != cnt + || ctype->map256_collection[1][cnt] != cnt) + ctype->nonascii_case = 1; + /* Now that the tests are done make sure the name array contains all characters which are handled in the WIDTH section of the character set definition file. 
*/ @@ -1045,6 +1067,9 @@ ctype_output (struct localedef_t *locale, const struct charmap_t *charmap, CTYPE_DATA (_NL_CTYPE_MAP_TO_NONASCII, &ctype->to_nonascii, sizeof (uint32_t)); + CTYPE_DATA (_NL_CTYPE_NONASCII_CASE, + &ctype->nonascii_case, sizeof (uint32_t)); + case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN): iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t)); iov[2 + elem + offset].iov_len = sizeof (uint32_t); diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 3eded66512..1f24aa5849 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,9 @@ +2009-07-20 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Minor + optimizations of last changes. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Likewise. + 2009-07-19 Ulrich Drepper <drepper@redhat.com> * sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Define diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index f81466e1a5..e12790cb96 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -160,16 +160,14 @@ __pthread_cond_timedwait: movq 8(%rsp), %rdi movq %r13, %r10 + movl $FUTEX_WAIT_BITSET, %esi cmpq $-1, dep_mutex(%rdi) - movl $FUTEX_WAIT_BITSET, %eax - movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi je 60f movq dep_mutex(%rdi), %r8 /* Requeue to a PI mutex if the PI bit is set. 
*/ testl $PI_BIT, MUTEX_KIND(%r8) - je 60f + je 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi xorl %eax, %eax @@ -191,10 +189,10 @@ __pthread_cond_timedwait: cmpq $-4095, %rax jnae 62f - movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi subq $cond_futex, %rdi #endif +61: movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi 60: xorl %r15d, %r15d xorl %eax, %eax /* The following only works like this because we only support diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index e6323ea3e2..2fab38e277 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -128,28 +128,15 @@ __pthread_cond_wait: movq 8(%rsp), %rdi xorq %r10, %r10 movq %r12, %rdx - // XXX reverse + lea - addq $cond_futex, %rdi - cmpq $-1, dep_mutex-cond_futex(%rdi) -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAIT, %eax - movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi -# if FUTEX_WAIT != 0 -# error "cc destroyed by following orl" - orl $FUTEX_WAIT, %esi -# endif -#endif + cmpq $-1, dep_mutex(%rdi) + leaq cond_futex(%rdi), %rdi + movl $FUTEX_WAIT, %esi je 60f movq dep_mutex-cond_futex(%rdi), %r8 /* Requeue to a PI mutex if the PI bit is set. 
*/ testl $PI_BIT, MUTEX_KIND(%r8) - je 60f + je 61f movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi movl $SYS_futex, %eax @@ -162,9 +149,17 @@ __pthread_cond_wait: cmpq $-4095, %rax jnae 62f - movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi +# ifndef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAIT, %esi +# endif #endif +61: +#ifdef __ASSUME_PRIVATE_FUTEX + movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi +#else + orl %fs:PRIVATE_FUTEX, %esi +#endif 60: xorl %r13d, %r13d movl $SYS_futex, %eax syscall diff --git a/string/strcasestr.c b/string/strcasestr.c index 92f2eac7c8..088b5d91c7 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -1,5 +1,6 @@ /* Return the offset of one string within another. - Copyright (C) 1994, 1996-2000, 2004, 2008 Free Software Foundation, Inc. + Copyright (C) 1994, 1996-2000, 2004, 2008, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -52,11 +53,16 @@ #undef strcasestr #undef __strcasestr +#ifndef STRCASESTR +#define STRCASESTR __strcasestr +#endif + + /* Find the first occurrence of NEEDLE in HAYSTACK, using case-insensitive comparison. This function gives unspecified results in multibyte locales. */ char * -__strcasestr (const char *haystack_start, const char *needle_start) +STRCASESTR (const char *haystack_start, const char *needle_start) { const char *haystack = haystack_start; const char *needle = needle_start; diff --git a/string/strstr.c b/string/strstr.c index a9dc312992..ef45f82758 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -1,5 +1,6 @@ /* Return the offset of one string within another. - Copyright (C) 1994,1996,1997,2000,2001,2003,2008 Free Software Foundation, Inc. + Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -40,11 +41,15 @@ #undef strstr +#ifndef STRSTR +#define STRSTR strstr +#endif + /* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK if NEEDLE is empty, otherwise NULL if NEEDLE is not found in HAYSTACK. */ char * -strstr (const char *haystack_start, const char *needle_start) +STRSTR (const char *haystack_start, const char *needle_start) { const char *haystack = haystack_start; const char *needle = needle_start; diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 71e85f0652..5ce14aad8d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -6,9 +6,11 @@ endif ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strncmp-c ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c +sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 +CFLAGS-strstr.c += -msse4 +CFLAGS-strcasestr.c += -msse4 endif endif diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c new file mode 100644 index 0000000000..e6879531bc --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasestr-c.c @@ -0,0 +1,18 @@ +#include "init-arch.h" + +#define STRCASESTR __strcasestr_sse2 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strcasestr_sse2, __GI_strcasestr, __strcasestr_sse2); + +#include "string/strcasestr.c" + +extern char *__strcasestr_sse42 (const char *, const char *); + +#if 1 +libc_ifunc (__strcasestr, + HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2); +#else +libc_ifunc (__strcasestr, + 0 ? 
__strcasestr_sse42 : __strcasestr_sse2); +#endif diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c new file mode 100644 index 0000000000..064e3ef4fd --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasestr.c @@ -0,0 +1,3 @@ +#define USE_AS_STRCASESTR +#define STRSTR_SSE42 __strcasestr_sse42 +#include "strstr.c" diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c new file mode 100644 index 0000000000..cff99b71ef --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr-c.c @@ -0,0 +1,12 @@ +#include "init-arch.h" + +#define STRSTR __strstr_sse2 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); + +#include "string/strstr.c" + +extern char *__strstr_sse42 (const char *, const char *); + +libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2); diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c new file mode 100644 index 0000000000..76d5ad16df --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -0,0 +1,464 @@ +/* strstr with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> + +#ifndef STRSTR_SSE42 +# define STRSTR_SSE42 __strstr_sse42 +#endif + +#ifdef USE_AS_STRCASESTR +# include <ctype.h> +# include <locale/localeinfo.h> + +# define LOADBYTE(C) tolower (C) +# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2)) +#else +# define LOADBYTE(C) (C) +# define CMPBYTE(C1, C2) ((C1) == (C2)) +#endif + +/* We use 0x0c ordered-compare: + _SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ORDERED + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to do the scanning and string comparison requirements of + sub-string match. In the scanning phase, we process Cflag and ECX + index to locate the first fragment match; once the first fragment + match position has been identified, we do comparison of subsequent + string fragments until we can conclude false or true match; when + concluding a false match, we may need to repeat scanning process + from next relevant offset in the target string. + + In the scanning phase we have 4 cases: + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2a 16 0 0 1 + 2b 16 0 1 0 + 2c 16 0 1 1 + + 1. No ordered-comparison match, both 16B fragments are valid, so + continue to next fragment. + 2. No ordered-comparison match, there is EOS in either fragment, + 2a. Zflg = 0, Sflg = 1, we continue + 2b. Zflg = 1, Sflg = 0, we conclude no match and return. + 2c. Zflg = 1, Sflg = 1, length determines match or no match + + In the string comparison phase, the 1st fragment match is fixed up + to produce ECX = 0. Subsequent fragment compare of nonzero index + and no match conclude a false match. + + case ECX CFlag ZFlag SFlag + 3 X 1 0 0/1 + 4a 0 1 0 0 + 4b 0 1 0 1 + 4c 0 < X 1 0 0/1 + 5 16 0 1 0 + + 3. An initial ordered-comparison fragment match, we fix up to do + subsequent string comparison + 4a. 
Continuation of fragment comparison of a string compare. + 4b. EOS reached in the reference string, we conclude true match and + return + 4c. String compare failed if index is nonzero, we need to go back to + scanning + 5. failed string compare, go back to scanning + */ + +/* Fix-up of removal of unneeded data due to 16B aligned load + parameters: + value: 16B data loaded from 16B aligned address. + offset: Offset of target data address relative to 16B aligned load + address. + */ + +static __inline__ __m128i +__m128i_shift_right (__m128i value, int offset) +{ + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + return value; +} + +/* Simple replacement of movdqu to address 4KB boundary cross issue. + If EOS occurs within less than 16B before 4KB boundary, we don't + cross to next page. 
*/ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu (const unsigned char * p) +{ + int offset = ((size_t) p & (16 - 1)); + + if (offset && (int) ((size_t) p & 0xfff) > 0xff0) + { + __m128i a = _mm_load_si128 ((__m128i *) (p - offset)); + __m128i zero = _mm_setzero_si128 (); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero)); + if ((bmsk >> offset) != 0) + return __m128i_shift_right (a, offset); + } + return _mm_loadu_si128 ((__m128i *) p); +} + +#ifdef USE_AS_STRCASESTR + +/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C + locale. */ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu_tolower_posix (const unsigned char * p) +{ + __m128i frag = __m128i_strloadu (p); + + /* Convert frag to lower case for POSIX/C locale. */ + __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41); + __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0); + __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44); + __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1); + mask2 = _mm_sub_epi8 (mask2, u2ldelta); + return _mm_blendv_epi8 (frag, mask2, mask1); +} + +/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C + locale. */ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu_tolower (const unsigned char * p) +{ + union + { + char b[16]; + __m128i x; + } u; + + for (int i = 0; i < 16; i++) + if (p[i] == 0) + { + u.b[i] = 0; + break; + } + else + u.b[i] = tolower (p[i]); + + return u.x; +} +#endif + +/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP + algorithm) overlap for a fully populated 16B vector. + Input parameter: 1st 16Byte loaded from the reference string of a + strstr function. + We don't use KMP algorithm if reference string is less than 16B. 
+ */ + +static int +__inline__ __attribute__ ((__always_inline__,)) +KMP16Bovrlap (__m128i s2) +{ + __m128i b = _mm_unpacklo_epi8 (s2, s2); + __m128i a = _mm_unpacklo_epi8 (b, b); + a = _mm_shuffle_epi32 (a, 0); + b = _mm_srli_si128 (s2, sizeof (char)); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a)); + + /* _BitScanForward(&k1, bmsk); */ + int k1; + __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk)); + if (!bmsk) + return 16; + else if (bmsk == 0x7fff) + return 1; + else if (!k1) + { + /* There are at least two distinct chars in s2. If byte 0 and 1 are + identical and the distinct value lies farther down, we can deduce + the next byte offset to restart full compare is no earlier + than byte 3. */ + return 3; + } + else + { + /* Byte 1 is not degenerated to byte 0. */ + return k1 + 1; + } +} + +char * +__attribute__ ((section (".text.sse4.2"))) +STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2) +{ +#define p1 s1 + const unsigned char *p2 = s2; + + if (p2[0] == '\0') + return (char *) p1; + + if (p1[0] == '\0') + return NULL; + + /* Check if p1 length is 1 byte long. */ + if (p1[1] == '\0') + return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; + +#ifdef USE_AS_STRCASESTR + __m128i (*strloadu) (const unsigned char *); + + if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0) + strloadu = __m128i_strloadu_tolower_posix; + else + strloadu = __m128i_strloadu_tolower; +#else +# define strloadu __m128i_strloadu +#endif + + /* p1 > 1 byte long. Load up to 16 bytes of fragment. */ + __m128i frag1 = strloadu (p1); + + __m128i frag2; + if (p2[1] != '\0') + /* p2 is > 1 byte long. */ + frag2 = strloadu (p2); + else + frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0); + + /* Unsigned bytes, equal order, does frag2 have a null? 
*/ + int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + int cmp = _mm_cmpistri (frag2, frag1, 0x0c); + int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + if (cmp_s & cmp_c) + { + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, + _mm_setzero_si128 ())); + int len; + __asm ("bsfl %[bmsk], %[len]" + : [len] "=r" (len) : [bmsk] "r" (bmsk)); + p1 += cmp; + if ((len + cmp) <= 16) + return (char *) p1; + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + if ((len + cmp) <= 16) + return (char *) p1 + cmp; + } + + if (cmp_s) + { + /* Adjust addr for 16B alignment in ensuing loop. */ + while (!cmp_z) + { + p1 += cmp; + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp + once already, this time cmp will be zero and we can exit. */ + if ((!cmp) & cmp_c) + break; + } + + if (!cmp_c) + return NULL; + + /* Since s2 is less than 16 bytes, cmp_c is the definitive + determination of full match. */ + return (char *) p1 + cmp; + } + + /* General case, s2 is at least 16 bytes. + First, the common case of false-match at first byte of p2. */ + const unsigned char *pt = NULL; + int kmp_fwd = 0; +re_trace: + while (!cmp_c) + { + /* frag1 has null. */ + if (cmp_z) + return NULL; + + /* frag 1 has no null, advance 16 bytes. */ + p1 += 16; + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + /* Unsigned bytes, equal order, is there a partial match? 
*/ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + } + + /* Next, handle initial positive match as first byte of p2. We have + a partial fragment match, make full determination until we reached + end of s2. */ + if (!cmp) + { + if (cmp_z) + return (char *) p1; + + pt = p1; + p1 += 16; + p2 += 16; + /* Load up to 16 bytes of fragment. */ + frag2 = strloadu (p2); + } + else + { + /* Adjust 16B alignment. */ + p1 += cmp; + pt = p1; + } + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + + /* Unsigned bytes, equal order, does frag2 has null? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + while (!(cmp | cmp_z | cmp_s)) + { + p1 += 16; + p2 += 16; + /* Load up to 16 bytes of fragment. */ + frag2 = strloadu (p2); + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + /* Unsigned bytes, equal order, does frag2 has null? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + } + + /* Full determination yielded a false result, retrace s1 to next + starting position. + Zflg 1 0 1 0/1 + Sflg 0 1 1 0/1 + cmp na 0 0 >0 + action done done continue continue if s2 < s1 + false match retrace s1 else false + */ + + if (cmp_s & !cmp) + return (char *) pt; + if (cmp_z) + { + if (!cmp_s) + return NULL; + + /* Handle both zero and sign flag set and s1 is shorter in + length. 
*/ + __m128i zero = _mm_setzero_si128 (); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2)); + int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1)); + int len; + int len1; + __asm ("bsfl %[bmsk], %[len]" + : [len] "=r" (len) : [bmsk] "r" (bmsk)); + __asm ("bsfl %[bmsk1], %[len1]" + : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1)); + if (len >= len1) + return NULL; + } + else if (!cmp) + return (char *) pt; + + /* Otherwise, we have to retrace and continue. Default of multiple + paths that need to retrace from next byte in s1. */ + p2 = s2; + frag2 = strloadu (p2); + + if (!kmp_fwd) + kmp_fwd = KMP16Bovrlap (frag2); + + /* KMP algorithm predicted overlap needs to be corrected for + partial fragment compare. */ + p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd); + + /* Since s2 is at least 16 bytes long, we're certain there is no + match. */ + if (p1[0] == '\0') + return NULL; + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + + /* Unsigned bytes, equal order, is there a partial match? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + goto re_trace; +} |