From 97fd3a3003b9eb980395417ffb104e02bf315fe8 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 26 Nov 2003 03:24:15 +0000
Subject: Update.

2003-11-25  Ulrich Drepper  <drepper@redhat.com>

	* posix/runptests.c (main): Make errors fatal.
	* posix/PTESTS: One test in GA135 and two in GA136 check
	functionality which does not seem to be guaranteed.

2003-11-25  Jakub Jelinek  <jakub@redhat.com>

	* posix/regexec.c (re_search_internal): If prune_impossible_nodes
	returned REG_NOMATCH, set match_last to -1.  Don't initialize
	pmatch[0] needlessly.  Fix comment.
	(prune_impossible_nodes): Don't segfault on NULL state_log entry.
	(set_regs): Fix comment.
	* posix/regcomp.c (parse_bracket_exp): Only set has_plural_match
	if adding both SIMPLE_BRACKET and COMPLEX_BRACKET.
	(build_charclass_op): Set has_plural_match if adding both
	SIMPLE_BRACKET and COMPLEX_BRACKET.
	* posix/bug-regex11.c (tests): Fix register values for one commented
	out test.  Add new tests.

	* posix/regex_internal.c (re_string_allocate): Make sure init_len
	is at least dfa->mb_cur_max.
	(re_string_reconstruct): If is_utf8, don't fall back to
	re_string_skip_chars just because idx points into the middle of
	a valid UTF-8 character.  Instead, set the wcs entries which
	correspond to the partial character bytes to WEOF.
	* posix/regexec.c (re_search_internal): Allocate input.bufs_len + 1
	instead of dfa->nodes_len + 1 state_log entries initially.
	* posix/bug-regex20.c (main): Uncomment backwards case insensitive
	tests.
---
 posix/PTESTS           | 13 ++++++++++---
 posix/bug-regex11.c    | 11 ++++++++++-
 posix/bug-regex20.c    |  3 +--
 posix/ptestcases.h     | 13 ++++++++++---
 posix/regcomp.c        |  3 ++-
 posix/regex_internal.c | 25 +++++++++++++++----------
 posix/regexec.c        | 13 ++++++++-----
 posix/runptests.c      |  5 +----
 8 files changed, 57 insertions(+), 29 deletions(-)

(limited to 'posix')

diff --git a/posix/PTESTS b/posix/PTESTS
index 8732a2ccfe..02b357cf2e 100644
--- a/posix/PTESTS
+++ b/posix/PTESTS
@@ -226,11 +226,18 @@
 1¦20¦a\(.*b\)c¦axcaxbbbcsxbbbbbbbbc¦
 # GA135
 1¦7¦\(a\(b\(c\(d\(e\)\)\)\)\)\4¦abcdededede¦
-1¦2¦a\(b\)*c\1¦acb¦
+#W POSIX does not really specify whether a\(b\)*c\1 matches acb.
+#W back references are supposed to expand to the last match, but what
+#W if there never was a match as in this case?
+-1¦-1¦a\(b\)*c\1¦acb¦
 1¦11¦\(a\(b\(c\(d\(e\(f\(g\)h\(i\(j\)\)\)\)\)\)\)\)\9¦abcdefghijjk¦
 # GA136
-1¦2¦a\(b\)*c\1¦acb¦
-4¦7¦a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST¦
+#W These two tests have the same problem as the test in GA135.  No match
+#W of a subexpression, why should the back reference be usable?
+#W 1 2 a\(b\)*c\1 acb
+#W 4 7 a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST
+-1¦-1¦a\(b\)*c\1¦acb¦
+-1¦-1¦a\(b\(c\(d\(f\)*\)\)\)\4¦xYzabcdePQRST¦
 # GA137
 -2¦-2¦\(a\(b\)\)\3¦foo¦
 -2¦-2¦\(a\(b\)\)\(a\(b\)\)\5¦foo¦
diff --git a/posix/bug-regex11.c b/posix/bug-regex11.c
index 7c7ef52e73..a9c319e2d6 100644
--- a/posix/bug-regex11.c
+++ b/posix/bug-regex11.c
@@ -54,13 +54,22 @@ struct
   { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
   { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
   /* More tests on backreferences.  */
+  { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+  { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } },
   { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
   { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
   { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+  { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } },
+  { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+  { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } },
+  { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
 #if 0
   /* XXX Not used since they fail so far.  */
-  { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 1, 2 } } },
+  { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
   { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+  { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
+  { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 } } },
+  { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } } },
   { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } },
   { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
     "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
diff --git a/posix/bug-regex20.c b/posix/bug-regex20.c
index e709ef5fee..e55a06d270 100644
--- a/posix/bug-regex20.c
+++ b/posix/bug-regex20.c
@@ -271,7 +271,6 @@ main (void)
 	  continue;
 	}
 
-      /* XXX: This causes regex segfault.  Disable for now.
       res = re_search (&regbuf, tests[i].string, str_len, str_len, -str_len,
 		       NULL);
       if (res != tests[i].res)
@@ -280,7 +279,7 @@ main (void)
 	  ret = 1;
 	  regfree (&regbuf);
 	  continue;
-	}  */
+	}
       regfree (&regbuf);
     }
 
diff --git a/posix/ptestcases.h b/posix/ptestcases.h
index 2819004b50..506b1cce0f 100644
--- a/posix/ptestcases.h
+++ b/posix/ptestcases.h
@@ -221,11 +221,18 @@
   { 1, 20, "a\\(.*b\\)c", "axcaxbbbcsxbbbbbbbbc",  },
   { 0, 0, "GA135", NULL, },
   { 1, 7, "\\(a\\(b\\(c\\(d\\(e\\)\\)\\)\\)\\)\\4", "abcdededede",  },
-  { 1, 2, "a\\(b\\)*c\\1", "acb",  },
+  { 0, 0, NULL, "POSIX does not really specify whether a\\(b\\)*c\\1 matches acb." },
+  { 0, 0, NULL, "back references are supposed to expand to the last match, but what" },
+  { 0, 0, NULL, "if there never was a match as in this case?" },
+  { -1, -1, "a\\(b\\)*c\\1", "acb",  },
   { 1, 11, "\\(a\\(b\\(c\\(d\\(e\\(f\\(g\\)h\\(i\\(j\\)\\)\\)\\)\\)\\)\\)\\)\\9", "abcdefghijjk",  },
   { 0, 0, "GA136", NULL, },
-  { 1, 2, "a\\(b\\)*c\\1", "acb",  },
-  { 4, 7, "a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4", "xYzabcdePQRST",  },
+  { 0, 0, NULL, "These two tests have the same problem as the test in GA135.  No match" },
+  { 0, 0, NULL, "of a subexpression, why should the back reference be usable?" },
+  { 0, 0, NULL, "1 2 a\\(b\\)*c\\1 acb" },
+  { 0, 0, NULL, "4 7 a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4¦xYzabcdePQRST" },
+  { -1, -1, "a\\(b\\)*c\\1", "acb",  },
+  { -1, -1, "a\\(b\\(c\\(d\\(f\\)*\\)\\)\\)\\4", "xYzabcdePQRST",  },
   { 0, 0, "GA137", NULL, },
   { -2, -2, "\\(a\\(b\\)\\)\\3", "foo",  },
   { -2, -2, "\\(a\\(b\\)\\)\\(a\\(b\\)\\)\\5", "foo",  },
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 52f1fa23e1..1f1c85926e 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -3213,7 +3213,6 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
       int sbc_idx;
       /* Build a tree for complex bracket.  */
       dfa->has_mb_node = 1;
-      dfa->has_plural_match = 1;
       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
 	if (sbcset[sbc_idx])
 	  break;
@@ -3233,6 +3232,7 @@ parse_bracket_exp (regexp, dfa, token, syntax, err)
 	goto parse_bracket_exp_espace;
       /* Then join them by ALT node.  */
       alt_token.type = OP_ALT;
+      dfa->has_plural_match = 1;
       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
       if (BE (mbc_tree != NULL, 1))
 	return work_tree;
@@ -3627,6 +3627,7 @@ build_charclass_op (dfa, trans, class_name, extra, not, err)
 	goto build_word_op_espace;
       /* Then join them by ALT node.  */
       alt_token.type = OP_ALT;
+      dfa->has_plural_match = 1;
       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
       if (BE (mbc_tree != NULL, 1))
 	return tree;
diff --git a/posix/regex_internal.c b/posix/regex_internal.c
index 8b68bd62cb..f78ec79e65 100644
--- a/posix/regex_internal.c
+++ b/posix/regex_internal.c
@@ -55,7 +55,12 @@ re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
      const re_dfa_t *dfa;
 {
   reg_errcode_t ret;
-  int init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
+  int init_buf_len;
+
+  /* Ensure at least one character fits into the buffers.  */
+  if (init_len < dfa->mb_cur_max)
+    init_len = dfa->mb_cur_max;
+  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
   re_string_construct_common (str, len, pstr, trans, icase, dfa);
   pstr->stop = pstr->len;
 
@@ -516,33 +521,33 @@ re_string_reconstruct (pstr, idx, eflags, newline)
 		  /* Special case UTF-8.  Multi-byte chars start with any
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
-		  end = raw + (pstr->valid_len > offset - pstr->mb_cur_max
-			       ? pstr->valid_len : offset - pstr->mb_cur_max);
+		  end = raw + (offset - pstr->mb_cur_max);
 		  for (p = raw + offset - 1; p >= end; --p)
 		    if ((*p & 0xc0) != 0x80)
 		      {
 			mbstate_t cur_state;
 			wchar_t wc2;
+			int mlen;
 
 			/* XXX Don't use mbrtowc, we know which conversion
 			   to use (UTF-8 -> UCS4).  */
 			memset (&cur_state, 0, sizeof (cur_state));
-			if (mbrtowc (&wc2, p, raw + offset - p, &cur_state)
-			    == raw + offset - p)
+			mlen = mbrtowc (&wc2, p, raw + pstr->len - p,
+					&cur_state) - (raw + offset - p);
+			if (mlen >= 0)
 			  {
 			    memset (&pstr->cur_state, '\0',
 				    sizeof (mbstate_t));
+			    pstr->valid_len = mlen;
 			    wc = wc2;
 			  }
 			break;
 		      }
 		}
 	      if (wc == WEOF)
-		{
-		  pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
-		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
-		    pstr->wcs[wcs_idx] = WEOF;
-		}
+		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+	      for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+		pstr->wcs[wcs_idx] = WEOF;
 	      if (pstr->trans && wc <= 0xff)
 		wc = pstr->trans[wc];
 	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
diff --git a/posix/regexec.c b/posix/regexec.c
index 58ac9c82c4..9720879722 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -620,7 +620,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
      multi character collating element.  */
   if (nmatch > 1 || dfa->has_mb_node)
     {
-      mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
+      mctx.state_log = re_malloc (re_dfastate_t *, input.bufs_len + 1);
       if (BE (mctx.state_log == NULL, 0))
 	{
 	  err = REG_ESPACE;
@@ -766,6 +766,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 			break;
 		      if (BE (err != REG_NOMATCH, 0))
 			goto free_return;
+		      match_last = -1;
 		    }
 		  else
 		    break; /* We found a match.  */
@@ -785,7 +786,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
       int reg_idx;
 
       /* Initialize registers.  */
-      for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
+      for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 	pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 
       /* Set the points where matching start/end.  */
@@ -801,7 +802,8 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 	}
 
       /* At last, add the offset to the each registers, since we slided
-	 the buffers so that We can assume that the matching starts from 0.  */
+	 the buffers so that we could assume that the matching starts
+	 from 0.  */
       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 	if (pmatch[reg_idx].rm_so != -1)
 	  {
@@ -869,7 +871,8 @@ prune_impossible_nodes (preg, mctx)
 		  ret = REG_NOMATCH;
 		  goto free_return;
 		}
-	    } while (!mctx->state_log[match_last]->halt);
+	    } while (mctx->state_log[match_last] == NULL
+		     || !mctx->state_log[match_last]->halt);
 	  halt_node = check_halt_state_context (preg,
 						mctx->state_log[match_last],
 						mctx, match_last);
@@ -1236,7 +1239,7 @@ pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
 /* Set the positions where the subexpressions are starts/ends to registers
    PMATCH.
    Note: We assume that pmatch[0] is already set, and
-   pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1).  */
+   pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
 
 static reg_errcode_t
 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
diff --git a/posix/runptests.c b/posix/runptests.c
index 9ce395be4f..4d43180e41 100644
--- a/posix/runptests.c
+++ b/posix/runptests.c
@@ -119,8 +119,5 @@ main (int argc, char *argv[])
 
   printf ("\n%Zu tests, %d errors\n", cnt, errors);
 
-  /* We should return here the error status but since some tests are known
-     to fail this would only cause the libc testsuite to fail.  */
-  //return errors != 0;
-  return 0;
+  return errors != 0;
 }
-- 
cgit v1.2.3