about | summary | refs | log | tree | commit | diff
path: root/posix
diff options
context:
space:
mode:
Diffstat (limited to 'posix')
-rw-r--r-- posix/bug-regex20.c 22
-rw-r--r-- posix/regcomp.c 13
-rw-r--r-- posix/regexec.c 7
3 files changed, 31 insertions, 11 deletions
diff --git a/posix/bug-regex20.c b/posix/bug-regex20.c
index 11b9484faf..74662e6246 100644
--- a/posix/bug-regex20.c
+++ b/posix/bug-regex20.c
@@ -43,15 +43,35 @@ static struct
\xe2\x80\x94 EM DASH */
/* Should be optimized. */
{RE_SYNTAX_POSIX_BASIC, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\+z",
+ "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+ {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\{1,2\\}z",
+ "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
{RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
+ {RE_SYNTAX_POSIX_BASIC, "^x\\|x\xc3\xa4*z$",
+ "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1},
+ {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{6\\}z\\+",
+ "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
+ {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{2,36\\}z\\+",
+ "x\\\xc3\x84zz\xc3\xb6", -1, 1},
+ {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{,3\\}z\\+",
+ "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
{RE_SYNTAX_POSIX_BASIC, "x[C]y", "axCy", 1, 1},
{RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 1},
{RE_SYNTAX_POSIX_BASIC, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
{RE_SYNTAX_POSIX_BASIC, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
{RE_SYNTAX_POSIX_BASIC, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
+ {RE_SYNTAX_POSIX_BASIC, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1},
+ {RE_SYNTAX_POSIX_BASIC, "\\(x\xc3\x84\\)z\\1\x61\\1",
+ "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1},
+ {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
{RE_SYNTAX_POSIX_EXTENDED, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
{RE_SYNTAX_POSIX_EXTENDED, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
{RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
@@ -64,7 +84,6 @@ static struct
{RE_SYNTAX_POSIX_EXTENDED, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
/* Should not be optimized. */
{RE_SYNTAX_POSIX_BASIC, "x.y", "ax\xe2\x80\x94yz", 1, 0},
- {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "x[A-Z,]y", "axCy", 1, 0},
{RE_SYNTAX_POSIX_BASIC, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
@@ -77,7 +96,6 @@ static struct
{RE_SYNTAX_POSIX_BASIC, "a\\wz", "a\xc3\x84z", 0, 0},
{RE_SYNTAX_POSIX_BASIC, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x.y", "ax\xe2\x80\x94yz", 1, 0},
- {RE_SYNTAX_POSIX_EXTENDED, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x[A-Z,]y", "axCy", 1, 0},
{RE_SYNTAX_POSIX_EXTENDED, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 68ce551c3a..b5f0c92a3a 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -965,17 +965,14 @@ static void
optimize_utf8 (dfa)
re_dfa_t *dfa;
{
- int node, i;
+ int node, i, mb_chars = 0;
for (node = 0; node < dfa->nodes_len; ++node)
switch (dfa->nodes[node].type)
{
case CHARACTER:
- /* Chars >= 0x80 are optimizable in some cases (e.g. when not
- followed by DUP operator, not in bracket etc.).
- For now punt on them all. */
if (dfa->nodes[node].opr.c >= 0x80)
- return;
+ mb_chars = 1;
break;
case ANCHOR:
switch (dfa->nodes[node].opr.idx)
@@ -1010,6 +1007,12 @@ optimize_utf8 (dfa)
return;
}
+ if (mb_chars)
+ for (node = 0; node < dfa->nodes_len; ++node)
+ if (dfa->nodes[node].type == CHARACTER
+ && dfa->nodes[node].opr.c >= 0x80)
+ dfa->nodes[node].mb_partial = 0;
+
/* The search can be in single byte locale. */
dfa->mb_cur_max = 1;
dfa->is_utf8 = 0;
diff --git a/posix/regexec.c b/posix/regexec.c
index 7470197506..09756b7691 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3483,10 +3483,6 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
int elem_len = re_string_elem_size_at (input, str_idx);
int char_len = re_string_char_size_at (input, str_idx);
int i;
-# ifdef _LIBC
- int j;
- uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif /* _LIBC */
if (elem_len <= 1 && char_len <= 1)
return 0;
if (node->type == OP_PERIOD)
@@ -3505,6 +3501,8 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
# ifdef _LIBC
const unsigned char *pin = ((char *) re_string_get_buffer (input)
+ str_idx);
+ int j;
+ uint32_t nrules;
# endif /* _LIBC */
int match_len = 0;
wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
@@ -3529,6 +3527,7 @@ check_node_accept_bytes (preg, node_idx, input, str_idx)
}
# ifdef _LIBC
+ nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
{
unsigned int in_collseq = 0;