From 97fd3a3003b9eb980395417ffb104e02bf315fe8 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 26 Nov 2003 03:24:15 +0000 Subject: Update. 2003-11-25 Ulrich Drepper * posix/runptests.c (main): Make errors fatal. * posix/PTESTS: One test in GA135 and GA136 check functionality which seems not guaranteed. 2003-11-25 Jakub Jelinek * posix/regexec.c (re_search_internal): If prune_impossible_nodes returned REG_NOMATCH, set match_last to -1. Don't initialize pmatch[0] needlessly. Fix comment. (prune_impossible_nodes): Don't segfault on NULL state_log entry. (set_regs): Fix comment. * posix/regcomp.c (parse_bracket_exp): Only set has_plural_match if adding both SIMPLE_BRACKET and COMPLEX_BRACKET. (build_charclass_op): Set has_plural_match if adding both SIMPLE_BRACKET and COMPLEX_BRACKET. * posix/bug-regex11.c (tests): Fix register values for one commented out test. Add new tests. * posix/regex_internal.c (re_string_allocate): Make sure init_len is at least dfa->mb_cur_max. (re_string_reconstruct): If is_utf8, don't fall back into re_string_skip_chars just because idx points into a middle of valid UTF-8 character. Instead, set the wcs bytes which correspond to the partial character bytes to WEOF. * posix/regexec.c (re_search_internal): Allocate input.bufs_len + 1 instead of dfa->nodes_len + 1 state_log entries initially. * posix/bug-regex20.c (main): Uncomment backwards case insensitive tests. 
--- posix/PTESTS | 13 ++++++++++--- posix/bug-regex11.c | 11 ++++++++++- posix/bug-regex20.c | 3 +-- posix/ptestcases.h | 13 ++++++++++--- posix/regcomp.c | 3 ++- posix/regex_internal.c | 25 +++++++++++++++---------- posix/regexec.c | 13 ++++++++----- posix/runptests.c | 5 +---- 8 files changed, 57 insertions(+), 29 deletions(-) (limited to 'posix') diff --git a/posix/PTESTS b/posix/PTESTS index 8732a2ccfe..02b357cf2e 100644 --- a/posix/PTESTS +++ b/posix/PTESTS @@ -226,11 +226,18 @@ 1¦20¦a\(.*b\)c¦axcaxbbbcsxbbbbbbbbc¦ # GA135 1¦7¦\(a\(b\(c\(d\(e\)\)\)\)\)\4¦abcdededede¦ -1¦2¦a\(b\)*c\1¦acb¦ +#W POSIX does not really specify whether a\(b\)*c\1 matches acb. +#W back references are supposed to expand to the last match, but what +#W if there never was a match as in this case? +-1¦-1¦a\(b\)*c\1¦acb¦ 1¦11¦\(a\(b\(c\(d\(e\(f\(g\)h\(i\(j\)\)\)\)\)\)\)\)\9¦abcdefghijjk¦ # GA136 -1¦2¦a\(b\)*c\1¦acb¦ -4¦7¦a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST¦ +#W These two tests have the same problem as the test in GA135. No match +#W of a subexpression, why should the back reference be usable? +#W 1 2 a\(b\)*c\1 acb +#W 4 7 a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST +-1¦-1¦a\(b\)*c\1¦acb¦ +-1¦-1¦a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST¦ # GA137 -2¦-2¦\(a\(b\)\)\3¦foo¦ -2¦-2¦\(a\(b\)\)\(a\(b\)\)\5¦foo¦ diff --git a/posix/bug-regex11.c b/posix/bug-regex11.c index 7c7ef52e73..a9c319e2d6 100644 --- a/posix/bug-regex11.c +++ b/posix/bug-regex11.c @@ -54,13 +54,22 @@ struct { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } }, /* More tests on backreferences. 
*/ + { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, + { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } }, { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } }, { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } }, { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } }, + { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } }, + { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } }, + { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } }, + { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } }, #if 0 /* XXX Not used since they fail so far. */ - { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 1, 2 } } }, + { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } }, { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } }, + { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } }, + { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } } }, + { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } }, { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } }, { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$", "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } }, diff --git a/posix/bug-regex20.c b/posix/bug-regex20.c index e709ef5fee..e55a06d270 100644 --- a/posix/bug-regex20.c +++ b/posix/bug-regex20.c @@ -271,7 +271,6 @@ main (void) continue; } - /* XXX: This causes regex segfault. Disable for now. 
res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len, NULL); if (res != tests[i].res) @@ -280,7 +279,7 @@ main (void) ret = 1; regfree (&regbuf); continue; - } */ + } regfree (&regbuf); } diff --git a/posix/ptestcases.h b/posix/ptestcases.h index 2819004b50..506b1cce0f 100644 --- a/posix/ptestcases.h +++ b/posix/ptestcases.h @@ -221,11 +221,18 @@ { 1, 20, "a\\(.*b\\)c", "axcaxbbbcsxbbbbbbbbc", }, { 0, 0, "GA135", NULL, }, { 1, 7, "\\(a\\(b\\(c\\(d\\(e\\)\\)\\)\\)\\)\\4", "abcdededede", }, - { 1, 2, "a\\(b\\)*c\\1", "acb", }, + { 0, 0, NULL, "POSIX does not really specify whether a\\(b\\)*c\\1 matches acb." }, + { 0, 0, NULL, "back references are supposed to expand to the last match, but what" }, + { 0, 0, NULL, "if there never was a match as in this case?" }, + { -1, -1, "a\\(b\\)*c\\1", "acb", }, { 1, 11, "\\(a\\(b\\(c\\(d\\(e\\(f\\(g\\)h\\(i\\(j\\)\\)\\)\\)\\)\\)\\)\\)\\9", "abcdefghijjk", }, { 0, 0, "GA136", NULL, }, - { 1, 2, "a\\(b\\)*c\\1", "acb", }, - { 4, 7, "a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4", "xYzabcdePQRST", }, + { 0, 0, NULL, "These two tests have the same problem as the test in GA135. No match" }, + { 0, 0, NULL, "of a subexpression, why should the back reference be usable?" }, + { 0, 0, NULL, "1 2 a\\(b\\)*c\\1 acb" }, + { 0, 0, NULL, "4 7 a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4¦xYzabcdePQRST" }, + { -1, -1, "a\\(b\\)*c\\1", "acb", }, + { -1, -1, "a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4", "xYzabcdePQRST", }, { 0, 0, "GA137", NULL, }, { -2, -2, "\\(a\\(b\\)\\)\\3", "foo", }, { -2, -2, "\\(a\\(b\\)\\)\\(a\\(b\\)\\)\\5", "foo", }, diff --git a/posix/regcomp.c b/posix/regcomp.c index 52f1fa23e1..1f1c85926e 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -3213,7 +3213,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) int sbc_idx; /* Build a tree for complex bracket. 
*/ dfa->has_mb_node = 1; - dfa->has_plural_match = 1; for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx) if (sbcset[sbc_idx]) break; @@ -3233,6 +3232,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err) goto parse_bracket_exp_espace; /* Then join them by ALT node. */ alt_token.type = OP_ALT; + dfa->has_plural_match = 1; work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token); if (BE (mbc_tree != NULL, 1)) return work_tree; @@ -3627,6 +3627,7 @@ build_charclass_op (dfa, trans, class_name, extra, not, err) goto build_word_op_espace; /* Then join them by ALT node. */ alt_token.type = OP_ALT; + dfa->has_plural_match = 1; tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token); if (BE (mbc_tree != NULL, 1)) return tree; diff --git a/posix/regex_internal.c b/posix/regex_internal.c index 8b68bd62cb..f78ec79e65 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -55,7 +55,12 @@ re_string_allocate (pstr, str, len, init_len, trans, icase, dfa) const re_dfa_t *dfa; { reg_errcode_t ret; - int init_buf_len = (len + 1 < init_len) ? len + 1: init_len; + int init_buf_len; + + /* Ensure at least one character fits into the buffers. */ + if (init_len < dfa->mb_cur_max) + init_len = dfa->mb_cur_max; + init_buf_len = (len + 1 < init_len) ? len + 1: init_len; re_string_construct_common (str, len, pstr, trans, icase, dfa); pstr->stop = pstr->len; @@ -516,33 +521,33 @@ re_string_reconstruct (pstr, idx, eflags, newline) /* Special case UTF-8. Multi-byte chars start with any byte other than 0x80 - 0xbf. */ raw = pstr->raw_mbs + pstr->raw_mbs_idx; - end = raw + (pstr->valid_len > offset - pstr->mb_cur_max - ? pstr->valid_len : offset - pstr->mb_cur_max); + end = raw + (offset - pstr->mb_cur_max); for (p = raw + offset - 1; p >= end; --p) if ((*p & 0xc0) != 0x80) { mbstate_t cur_state; wchar_t wc2; + int mlen; /* XXX Don't use mbrtowc, we know which conversion to use (UTF-8 -> UCS4). 
*/ memset (&cur_state, 0, sizeof (cur_state)); - if (mbrtowc (&wc2, p, raw + offset - p, &cur_state) - == raw + offset - p) + mlen = mbrtowc (&wc2, p, raw + pstr->len - p, + &cur_state) - (raw + offset - p); + if (mlen >= 0) { memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); + pstr->valid_len = mlen; wc = wc2; } break; } } if (wc == WEOF) - { - pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; - for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) - pstr->wcs[wcs_idx] = WEOF; - } + pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; + for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) + pstr->wcs[wcs_idx] = WEOF; if (pstr->trans && wc <= 0xff) wc = pstr->trans[wc]; pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD diff --git a/posix/regexec.c b/posix/regexec.c index 58ac9c82c4..9720879722 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -620,7 +620,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, multi character collating element. */ if (nmatch > 1 || dfa->has_mb_node) { - mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1); + mctx.state_log = re_malloc (re_dfastate_t *, input.bufs_len + 1); if (BE (mctx.state_log == NULL, 0)) { err = REG_ESPACE; @@ -766,6 +766,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, break; if (BE (err != REG_NOMATCH, 0)) goto free_return; + match_last = -1; } else break; /* We found a match. */ @@ -785,7 +786,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, int reg_idx; /* Initialize registers. */ - for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) + for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; /* Set the points where matching start/end. 
*/ @@ -801,7 +802,8 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, } /* At last, add the offset to the each registers, since we slided - the buffers so that We can assume that the matching starts from 0. */ + the buffers so that we could assume that the matching starts + from 0. */ for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) if (pmatch[reg_idx].rm_so != -1) { @@ -869,7 +871,8 @@ prune_impossible_nodes (preg, mctx) ret = REG_NOMATCH; goto free_return; } - } while (!mctx->state_log[match_last]->halt); + } while (mctx->state_log[match_last] == NULL + || !mctx->state_log[match_last]->halt); halt_node = check_halt_state_context (preg, mctx->state_log[match_last], mctx, match_last); @@ -1236,7 +1239,7 @@ pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) /* Set the positions where the subexpressions are starts/ends to registers PMATCH. Note: We assume that pmatch[0] is already set, and - pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */ + pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */ static reg_errcode_t set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) diff --git a/posix/runptests.c b/posix/runptests.c index 9ce395be4f..4d43180e41 100644 --- a/posix/runptests.c +++ b/posix/runptests.c @@ -119,8 +119,5 @@ main (int argc, char *argv[]) printf ("\n%Zu tests, %d errors\n", cnt, errors); - /* We should return here the error status but since some tests are known - to fail this would only cause the libc testsuite to fail. */ - //return errors != 0; - return 0; + return errors != 0; } -- cgit v1.2.3