diff options
author | Ulrich Drepper <drepper@redhat.com> | 2004-11-08 22:49:44 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2004-11-08 22:49:44 +0000 |
commit | e40a38b383fdbc616eb110e7cd6f780d010783cc (patch) | |
tree | 5a5a8e9acbf760879b7eefcc46414f1454199eea /posix/regcomp.c | |
parent | d2c38eb3facb84db061289f20ff8a210f91e4115 (diff) | |
download | glibc-e40a38b383fdbc616eb110e7cd6f780d010783cc.tar glibc-e40a38b383fdbc616eb110e7cd6f780d010783cc.tar.gz glibc-e40a38b383fdbc616eb110e7cd6f780d010783cc.tar.bz2 glibc-e40a38b383fdbc616eb110e7cd6f780d010783cc.zip |
Update.
2004-11-08 Ulrich Drepper <drepper@redhat.com>
* posix/regcomp.c (utf8_sb_map): Define.
(free_dfa_content): Don't free dfa->sb_char if it's a pointer to
utf8_sb_map.
(init_dfa): Use utf8_sb_map instead of initializing memory when the
encoding is UTF-8.
* posix/regcomp.c (init_dfa): Get the codeset name outside glibc as
well. Check if it is spelled UTF8 as well as UTF-8, and check
case-insensitively. Set dfa->map_notascii manually when outside
glibc.
* posix/regex_internal.c (build_wcs_upper_buffer) [!_LIBC]: Enable
optimizations based on map_notascii.
* posix/regex_internal.h [HAVE_LANGINFO_H || HAVE_LANGINFO_CODESET
|| _LIBC]: Include langinfo.h.
* posix/regex_internal.h (struct re_backref_cache_entry): Add "more"
field.
* posix/regexec.c (check_dst_limits): Hoist computation of the source
and destination bkref_idx out of the loop. Pass it to
check_dst_limits_calc_pos.
(check_dst_limits_calc_pos_1): New function, containing the recursive
loop of check_dst_limits_calc_pos; uses the "more" field of
struct re_backref_cache_entry to control the loop.
(check_dst_limits_calc_pos): Store into "boundaries" the position
relative to lim's start and end positions. Do not accept eclosures,
accept bkref_idx instead. Call check_dst_limits_calc_pos_1 to do the
work.
(sift_states_bkref): Use the "more" field of
struct re_backref_cache_entry to control the loop. A big "if" was
turned into a continue and the function was reindented.
(get_subexp): Use the "more" field of struct re_backref_cache_entry
to control the loop.
(match_ctx_add_entry): Initialize the bkref_ents' "more" field.
(search_cur_bkref_entry): Return -1 if out of bounds.
* posix/regexec.c (empty_set): Remove.
(sift_states_backward): Remove cur_src variable. Move inner loop
to build_sifted_states.
(build_sifted_states): Extract from sift_states_backward. Do not
use empty_set.
(update_cur_sifted_state): Do not use empty_set. Special case
dest_nodes->nelem == 0.
Diffstat (limited to 'posix/regcomp.c')
-rw-r--r-- | posix/regcomp.c | 80 |
1 files changed, 69 insertions, 11 deletions
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 9b435a885e..bdd616dfbb 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -566,6 +566,23 @@ weak_alias (__regerror, regerror)
 #endif
 
 
+#ifdef RE_ENABLE_I18N
+/* This static array is used for the map to single-byte characters when
+   UTF-8 is used. Otherwise we would allocate memory just to initialize
+   it the same all the time. UTF-8 is the preferred encoding so this is
+   a worthwhile optimization. */
+static const bitset utf8_sb_map =
+{
+  /* Set the first 128 bits. */
+# if UINT_MAX == 0xffffffff
+  0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+# else
+# error "Add case for new unsigned int size"
+# endif
+};
+#endif
+
+
 static void
 free_dfa_content (re_dfa_t *dfa)
 {
@@ -613,7 +630,8 @@ free_dfa_content (re_dfa_t *dfa)
     }
   re_free (dfa->state_table);
 #ifdef RE_ENABLE_I18N
-  re_free (dfa->sb_char);
+  if (dfa->sb_char != utf8_sb_map)
+    re_free (dfa->sb_char);
 #endif
 #ifdef DEBUG
   re_free (dfa->re_str);
@@ -824,6 +842,9 @@ init_dfa (dfa, pat_len)
      int pat_len;
 {
   int table_size;
+#ifndef _LIBC
+  char *codeset_name;
+#endif
 
   memset (dfa, '\0', sizeof (re_dfa_t));
 
@@ -853,22 +874,59 @@ init_dfa (dfa, pat_len)
     dfa->is_utf8 = 1;
   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
                        != 0);
+#else
+# ifdef HAVE_LANGINFO_CODESET
+  codeset_name = nl_langinfo (CODESET);
+# else
+  codeset_name = getenv ("LC_ALL");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LC_CTYPE");
+  if (codeset_name == NULL || codeset_name[0] == '\0')
+    codeset_name = getenv ("LANG");
+  if (codeset_name == NULL)
+    codeset_name = "";
+  else if (strchr (codeset_name, '.') != NULL)
+    codeset_name = strchr (codeset_name, '.') + 1;
+# endif
+
+  if (strcasecmp (codeset_name, "UTF-8") == 0
+      || strcasecmp (codeset_name, "UTF8") == 0)
+    dfa->is_utf8 = 1;
+
+  /* We check exhaustively in the loop below if this charset is a
+     superset of ASCII. */
+  dfa->map_notascii = 0;
 #endif
+
 #ifdef RE_ENABLE_I18N
   if (dfa->mb_cur_max > 1)
     {
-      int i, j, ch;
-
-      dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
-      if (BE (dfa->sb_char == NULL, 0))
-        return REG_ESPACE;
       if (dfa->is_utf8)
-        memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
+        dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
       else
-        for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
-          for (j = 0; j < UINT_BITS; ++j, ++ch)
-            if (__btowc (ch) != WEOF)
-              dfa->sb_char[i] |= 1 << j;
+        {
+          int i, j, ch;
+
+          dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+          if (BE (dfa->sb_char == NULL, 0))
+            return REG_ESPACE;
+
+          /* Clear all bits, then set those corresponding to single
+             byte chars. */
+          bitset_empty (dfa->sb_char);
+
+          for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+            for (j = 0; j < UINT_BITS; ++j, ++ch)
+              {
+                wchar_t wch = __btowc (ch);
+                if (wch != WEOF)
+                  dfa->sb_char[i] |= 1u << j;
+# ifndef _LIBC
+                if (isascii (ch) && wch != (wchar_t) ch)
+                  dfa->map_notascii = 1;
+# endif
+              }
+        }
     }
 #endif
 