diff options
Diffstat (limited to 'posix')
-rw-r--r-- | posix/Makefile | 3 | ||||
-rw-r--r-- | posix/Versions | 4 | ||||
-rw-r--r-- | posix/bug-regex4.c | 1 | ||||
-rw-r--r-- | posix/bug-regex6.c | 74 | ||||
-rw-r--r-- | posix/regcomp.c | 60 | ||||
-rw-r--r-- | posix/regex_internal.h | 45 | ||||
-rw-r--r-- | posix/regexec.c | 94 |
7 files changed, 195 insertions, 86 deletions
diff --git a/posix/Makefile b/posix/Makefile index 008154de9a..db58cf5c7c 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -71,7 +71,7 @@ tests := tstgetopt testfnm runtests runptests \ tst-getlogin tst-mmap tst-getaddrinfo tst-truncate \ tst-truncate64 tst-fork tst-fnmatch tst-regexloc tst-dir \ tst-chmod bug-regex1 bug-regex2 bug-regex3 bug-regex4 \ - tst-gnuglob tst-regex bug-regex5 + tst-gnuglob tst-regex bug-regex5 bug-regex6 ifeq (yes,$(build-shared)) test-srcs := globtest tests += wordexp-test tst-exec tst-spawn @@ -125,6 +125,7 @@ tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata tst-regex-ENV = LOCPATH=$(common-objpfx)localedata bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata +bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata testcases.h: TESTS TESTS2C.sed sed -f TESTS2C.sed < $< > $@T diff --git a/posix/Versions b/posix/Versions index c79a042aa3..07dc49a4d4 100644 --- a/posix/Versions +++ b/posix/Versions @@ -105,6 +105,10 @@ libc { # Extended Interface. fnmatch; } + GLIBC_2.2.6 { + # For syscall wrapper + __nanosleep; + } GLIBC_PRIVATE { # functions which have an additional interface since they are # are cancelable. diff --git a/posix/bug-regex4.c b/posix/bug-regex4.c index 76841e9db3..5ecca1a8d0 100644 --- a/posix/bug-regex4.c +++ b/posix/bug-regex4.c @@ -35,7 +35,6 @@ main (void) setlocale (LC_ALL, "C"); - setlocale (LC_ALL, "C"); s = re_compile_pattern ("ab[cde]", 7, ®ex); if (s != NULL) { diff --git a/posix/bug-regex6.c b/posix/bug-regex6.c new file mode 100644 index 0000000000..9a06898a26 --- /dev/null +++ b/posix/bug-regex6.c @@ -0,0 +1,74 @@ +/* Test for regexec. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sys/types.h> +#include <regex.h> +#include <locale.h> +#include <stdio.h> + +int +main (int argc, char *argv[]) +{ + regex_t re; + regmatch_t mat[10]; + int i, j, ret = 0; + char *locales[] = { "C", "de_DE.UTF-8" }; + char *string = "http://www.regex.com/pattern/matching.html#intro"; + regmatch_t expect[10] = { + { 0, 48 }, { 0, 5 }, { 0, 4 }, { 5, 20 }, { 7, 20 }, { 20, 42 }, + { -1, -1 }, { -1, -1 }, { 42, 48 }, { 43, 48 } }; + + for (i = 0; i < sizeof (locales) / sizeof (locales[0]); ++i) + { + if (setlocale (LC_ALL, locales[i]) == NULL) + { + puts ("cannot set locale"); + ret = 1; + } + else if (regcomp (&re, + "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?", + REG_EXTENDED) != REG_NOERROR) + { + puts ("cannot compile expression \"[a-f]*\""); + ret = 1; + } + else if (regexec (&re, string, 10, mat, 0) == REG_NOMATCH) + { + puts ("no match"); + ret = 1; + } + else + { + if (! memcmp (mat, expect, sizeof (mat))) + printf ("matching ok for %s locale\n", locales[i]); + else + { + printf ("matching failed for %s locale:\n", locales[i]); + ret = 1; + for (j = 0; j < 9; ++j) + if (mat[j].rm_so != -1) + printf ("%d: %.*s\n", j, mat[j].rm_eo - mat[j].rm_so, + string + mat[j].rm_so); + } + } + } + + return ret; +} diff --git a/posix/regcomp.c b/posix/regcomp.c index b9b056065f..513604274c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -724,7 +724,6 @@ re_compile_internal (preg, pattern, length, syntax) dfa = re_realloc (preg->buffer, re_dfa_t, 1); if (dfa == NULL) return REG_ESPACE; - memset (dfa, '\0', sizeof (re_dfa_t)); preg->allocated = sizeof (re_dfa_t); } preg->buffer = (unsigned char *) dfa; @@ -781,6 +780,9 @@ init_dfa (dfa, pat_len) int pat_len; { int table_size; + + memset (dfa, '\0', sizeof (re_dfa_t)); + dfa->nodes_alloc = pat_len + 1; dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); @@ -1001,8 +1003,6 @@ calc_first (dfa, node) switch (type) { #ifdef DEBUG - case OP_OPEN_SUBEXP: - case OP_CLOSE_SUBEXP: case OP_OPEN_BRACKET: case OP_CLOSE_BRACKET: case OP_OPEN_DUP_NUM: @@ -1028,6 +1028,8 @@ calc_first (dfa, node) case SIMPLE_BRACKET: case OP_BACK_REF: case ANCHOR: + case OP_OPEN_SUBEXP: + case OP_CLOSE_SUBEXP: node->first = idx; break; case OP_DUP_PLUS: @@ -1041,14 +1043,6 @@ calc_first (dfa, node) case OP_ALT: node->first = idx; break; - case SUBEXP: - if (node->left == NULL) - { - if (node->next == -1) - calc_next (dfa, node); - node->first = node->next; - break; - } /* else fall through */ default: #ifdef DEBUG @@ -1161,7 +1155,9 @@ calc_epsdest (dfa, node) } re_node_set_init_2 (dfa->edests + idx, left, right); } - else if (dfa->nodes[idx].type == ANCHOR) + else if (dfa->nodes[idx].type == ANCHOR + || dfa->nodes[idx].type == OP_OPEN_SUBEXP + || dfa->nodes[idx].type == OP_CLOSE_SUBEXP) re_node_set_init_1 (dfa->edests + idx, node->next); } } @@ -2055,8 +2051,9 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) reg_errcode_t *err; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; - bin_tree_t *tree; + bin_tree_t *tree, *left_par, *right_par; size_t cur_nsub; + int new_idx; cur_nsub = preg->re_nsub++; if (dfa->subexps_alloc < preg->re_nsub) { @@ -2073,30 +2070,39 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) } dfa->subexps[cur_nsub].start = dfa->nodes_len; dfa->subexps[cur_nsub].end = -1; + + new_idx = re_dfa_add_node (dfa, *token, 0); + left_par = create_tree (NULL, NULL, 0, new_idx); + if (BE (new_idx == -1 || left_par == NULL, 0)) + return *err = REG_ESPACE, NULL; + dfa->nodes[new_idx].opr.idx = cur_nsub; *token = fetch_token (regexp, syntax); /* The subexpression may be a null string. */ if (token->type == OP_CLOSE_SUBEXP) - { - tree = create_tree (NULL, NULL, SUBEXP, 0); - if (BE (tree == NULL, 0)) - return *err = REG_ESPACE, NULL; - dfa->subexps[cur_nsub].end = dfa->nodes_len; - } + tree = NULL; else { tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); if (BE (*err != REG_NOERROR && tree == NULL, 0)) return NULL; - dfa->subexps[cur_nsub].end = dfa->nodes_len; - if (BE (token->type != OP_CLOSE_SUBEXP, 0)) - { - free_bin_tree (tree); - *err = REG_BADPAT; - return NULL; - } - tree = create_tree (tree, NULL, SUBEXP, 0); } + if (BE (token->type != OP_CLOSE_SUBEXP, 0)) + { + free_bin_tree (tree); + *err = REG_BADPAT; + return NULL; + } + new_idx = re_dfa_add_node (dfa, *token, 0); + dfa->subexps[cur_nsub].end = dfa->nodes_len; + right_par = create_tree (NULL, NULL, 0, new_idx); + tree = ((tree == NULL) ? right_par + : create_tree (tree, right_par, CONCAT, 0)); + tree = create_tree (left_par, tree, CONCAT, 0); + if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0)) + return *err = REG_ESPACE, NULL; + dfa->nodes[new_idx].opr.idx = cur_nsub; + return tree; } diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 95ae46e297..20622540e8 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -96,8 +96,6 @@ typedef enum NON_TYPE = 0, /* Token type, these are used only by token. */ - OP_OPEN_SUBEXP, - OP_CLOSE_SUBEXP, OP_OPEN_BRACKET, OP_CLOSE_BRACKET, OP_CHARSET_RANGE, @@ -124,6 +122,8 @@ typedef enum #endif /* RE_ENABLE_I18N */ /* Node type, These are used by token, node, tree. */ + OP_OPEN_SUBEXP, + OP_CLOSE_SUBEXP, OP_PERIOD, CHARACTER, END_OF_RE, @@ -142,24 +142,18 @@ typedef enum #ifdef RE_ENABLE_I18N typedef struct { - /* If this character set is the non-matching list. */ - unsigned int non_match : 1; - /* Multibyte characters. */ wchar_t *mbchars; - int nmbchars; /* Collating symbols. */ # ifdef _LIBC int32_t *coll_syms; # endif - int ncoll_syms; /* Equivalence classes. */ # ifdef _LIBC int32_t *equiv_classes; # endif - int nequiv_classes; /* Range expressions. */ # ifdef _LIBC @@ -169,17 +163,32 @@ typedef struct wchar_t *range_starts; wchar_t *range_ends; # endif /* not _LIBC */ - int nranges; /* Character classes. */ wctype_t *char_classes; + + /* If this character set is the non-matching list. */ + unsigned int non_match : 1; + + /* # of multibyte characters. */ + int nmbchars; + + /* # of collating symbols. */ + int ncoll_syms; + + /* # of equivalence classes. */ + int nequiv_classes; + + /* # of range expressions. */ + int nranges; + + /* # of character classes. */ int nchar_classes; } re_charset_t; #endif /* RE_ENABLE_I18N */ typedef struct { - re_token_type_t type; union { unsigned char c; /* for CHARACTER */ @@ -195,6 +204,11 @@ typedef struct re_node_set *bkref_eclosure; } *ctx_info; } opr; +#if __GNUC__ >= 2 + re_token_type_t type : 8; +#else + re_token_type_t type; +#endif unsigned int constraint : 10; /* context constraint */ unsigned int duplicated : 1; #ifdef RE_ENABLE_I18N @@ -203,8 +217,9 @@ typedef struct } re_token_t; #define IS_EPSILON_NODE(type) \ - ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS || \ - (type) == OP_DUP_QUESTION || (type) == ANCHOR) + ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \ + || (type) == OP_DUP_QUESTION || (type) == ANCHOR \ + || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP) #define ACCEPT_MB_NODE(type) \ ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD) @@ -214,9 +229,6 @@ struct re_string_t /* Indicate the raw buffer which is the original string passed as an argument of regexec(), re_search(), etc.. */ const unsigned char *raw_mbs; - /* Index in RAW_MBS. Each character mbs[i] corresponds to - raw_mbs[raw_mbs_idx + i]. */ - int raw_mbs_idx; /* Store the multibyte string. In case of "case insensitive mode" like REG_ICASE, upper cases of the string are stored, otherwise MBS points the same address that RAW_MBS points. */ @@ -230,6 +242,9 @@ struct re_string_t wint_t *wcs; mbstate_t cur_state; #endif + /* Index in RAW_MBS. Each character mbs[i] corresponds to + raw_mbs[raw_mbs_idx + i]. */ + int raw_mbs_idx; /* The length of the valid characters in the buffers. */ int valid_len; /* The length of the buffers MBS, MBS_CASE, and WCS. */ diff --git a/posix/regexec.c b/posix/regexec.c index 2c7a2774eb..5dd3a06827 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -58,6 +58,8 @@ static int check_halt_node_context (const re_dfa_t *dfa, int node, static int check_halt_state_context (const regex_t *preg, const re_dfastate_t *state, const re_match_context_t *mctx, int idx); +static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node, + int cur_idx, int nmatch); static int proceed_next_node (const regex_t *preg, const re_match_context_t *mctx, int *pidx, int node, re_node_set *eps_via_nodes); @@ -886,24 +888,38 @@ proceed_next_node (preg, mctx, pidx, node, eps_via_nodes) re_node_set *eps_via_nodes; { re_dfa_t *dfa = (re_dfa_t *)preg->buffer; - int i, dest_node = -1, err; + int i, err, dest_node, cur_entity; + dest_node = -1; + cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE) + ? dfa->nodes[node].opr.ctx_info->entity : node); if (IS_EPSILON_NODE (dfa->nodes[node].type)) { + int dest_entity = INT_MAX; err = re_node_set_insert (eps_via_nodes, node); if (BE (err < 0, 0)) return -1; for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i) { - int candidate = mctx->state_log[*pidx]->nodes.elems[i]; - if (!re_node_set_contains (dfa->edests + node, candidate) - && !(dfa->nodes[candidate].type == OP_CONTEXT_NODE - && re_node_set_contains (dfa->edests + node, - dfa->nodes[candidate].opr.ctx_info->entity))) - continue; - dest_node = candidate; + int candidate, candidate_entity; + candidate = mctx->state_log[*pidx]->nodes.elems[i]; + candidate_entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE) + ? dfa->nodes[candidate].opr.ctx_info->entity + : candidate); + if (!re_node_set_contains (dfa->edests + node, candidate)) + if (candidate == candidate_entity + || !re_node_set_contains (dfa->edests + node, candidate_entity)) + continue; + /* In order to avoid infinite loop like "(a*)*". */ - if (!re_node_set_contains (eps_via_nodes, dest_node)) - break; + if (cur_entity > candidate_entity + && re_node_set_contains (eps_via_nodes, candidate)) + continue; + + if (dest_entity > candidate_entity) + { + dest_node = candidate; + dest_entity = candidate_entity; + } } #ifdef DEBUG assert (dest_node != -1); @@ -986,9 +1002,8 @@ set_regs (preg, mctx, nmatch, pmatch, last_node) int last_node; { re_dfa_t *dfa = (re_dfa_t *)preg->buffer; - int idx, cur_node, node_entity, real_nmatch; + int idx, cur_node, real_nmatch; re_node_set eps_via_nodes; - int i; #ifdef DEBUG assert (nmatch > 1); assert (mctx->state_log != NULL); @@ -998,36 +1013,7 @@ set_regs (preg, mctx, nmatch, pmatch, last_node) re_node_set_init_empty (&eps_via_nodes); for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) { - node_entity = ((dfa->nodes[cur_node].type == OP_CONTEXT_NODE) - ? dfa->nodes[cur_node].opr.ctx_info->entity : cur_node); - for (i = 1; i < real_nmatch; ++i) - { - if (dfa->subexps[i - 1].start == dfa->subexps[i - 1].end) - { - /* In case of the null subexpression like '()'. */ - if (dfa->subexps[i - 1].start == node_entity) - { - pmatch[i].rm_so = idx; - pmatch[i].rm_eo = idx; - } - } - else if (dfa->subexps[i - 1].start <= node_entity - && node_entity < dfa->subexps[i - 1].end) - { - if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo != -1) - /* We are at the first node of this sub expression. */ - { - pmatch[i].rm_so = idx; - pmatch[i].rm_eo = -1; - } - } - else - { - if (pmatch[i].rm_so != -1 && pmatch[i].rm_eo == -1) - /* We are at the last node of this sub expression. */ - pmatch[i].rm_eo = idx; - } - } + update_regs (dfa, pmatch, cur_node, idx, real_nmatch); if (idx == pmatch[0].rm_eo && cur_node == last_node) break; @@ -1040,6 +1026,30 @@ set_regs (preg, mctx, nmatch, pmatch, last_node) return REG_NOERROR; } +static void +update_regs (dfa, pmatch, cur_node, cur_idx, nmatch) + re_dfa_t *dfa; + regmatch_t *pmatch; + int cur_node, cur_idx, nmatch; +{ + int type = dfa->nodes[cur_node].type; + int reg_num; + if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP) + return; + reg_num = dfa->nodes[cur_node].opr.idx + 1; + if (reg_num >= nmatch) + return; + if (type == OP_OPEN_SUBEXP) + { + /* We are at the first node of this sub expression. */ + pmatch[reg_num].rm_so = cur_idx; + pmatch[reg_num].rm_eo = -1; + } + else if (type == OP_CLOSE_SUBEXP) + /* We are at the first node of this sub expression. */ + pmatch[reg_num].rm_eo = cur_idx; + } + #define NUMBER_OF_STATE 1 /* This function checks the STATE_LOG from the MCTX->match_last to 0 |