From 37369d1cefec4d1990e59fbec613d2bc66e68eba Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 29 Nov 2003 06:40:52 +0000 Subject: Update. * misc/mntent_r.c (decode_name): Fix decoding of tab, add decoding of newline. * manual/sysinfo.texi (mtab): Adjust description accordingly. Reported by Andries.Brouwer@cwi.nl. --- ChangeLog | 5 + manual/sysinfo.texi | 13 +- misc/mntent_r.c | 8 +- posix/regex_internal.c | 460 ++++++++++++++++++++++++++++++++++++------------- 4 files changed, 360 insertions(+), 126 deletions(-) diff --git a/ChangeLog b/ChangeLog index a04f6895e0..8d963e4618 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2003-11-28 Ulrich Drepper + * misc/mntent_r.c (decode_name): Fix decoding of tab, add decoding + of newline. + * manual/sysinfo.texi (mtab): Adjust description accordingly. + Reported by Andries.Brouwer@cwi.nl. + * sysdeps/x86_64/fpu/libm-test-ulps: Add some more minor changes to compensate other setup. diff --git a/manual/sysinfo.texi b/manual/sysinfo.texi index a310240801..0a44830359 100644 --- a/manual/sysinfo.texi +++ b/manual/sysinfo.texi @@ -673,12 +673,13 @@ filled with the information from the next entry from the file currently read. The file format used prescribes the use of spaces or tab characters to -separate the fields. This makes it harder to use name containing one of -these characters (e.g., mount points using spaces). Therefore these -characters are encoded in the files and the @code{getmntent} function -takes care of the decoding while reading the entries back in. -@code{'\040'} is used to encode a space character, @code{'\012'} to -encode a tab character and @code{'\\'} to encode a backslash. +separate the fields. This makes it harder to use name containing one +of these characters (e.g., mount points using spaces). Therefore +these characters are encoded in the files and the @code{getmntent} +function takes care of the decoding while reading the entries back in. +@code{'\040'} is used to encode a space character, @code{'\011'} to +encode a tab character, @code{'\012'} to encode a newline character, +and @code{'\\'} to encode a backslash. If there was an error or the end of the file is reached the return value is @code{NULL}. diff --git a/misc/mntent_r.c b/misc/mntent_r.c index 9bfe8756f1..3a47f61f38 100644 --- a/misc/mntent_r.c +++ b/misc/mntent_r.c @@ -84,12 +84,18 @@ decode_name (char *buf) *wp++ = ' '; rp += 3; } - else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '2') + else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '1') { /* \012 is a TAB. */ *wp++ = '\t'; rp += 3; } + else if (rp[0] == '\\' && rp[1] == '0' && rp[2] == '1' && rp[3] == '2') + { + /* \012 is a NEWLINE. */ + *wp++ = '\n'; + rp += 3; + } else if (rp[0] == '\\' && rp[1] == '\\') { /* We have to escape \\ to be able to represent all characters. */ diff --git a/posix/regex_internal.c b/posix/regex_internal.c index f78ec79e65..1fd3e164ef 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -62,17 +62,14 @@ re_string_allocate (pstr, str, len, init_len, trans, icase, dfa) init_len = dfa->mb_cur_max; init_buf_len = (len + 1 < init_len) ? len + 1: init_len; re_string_construct_common (str, len, pstr, trans, icase, dfa); - pstr->stop = pstr->len; ret = re_string_realloc_buffers (pstr, init_buf_len); if (BE (ret != REG_NOERROR, 0)) return ret; - pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case - : (unsigned char *) str); - pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; - pstr->valid_len = (MBS_CASE_ALLOCATED (pstr) || MBS_ALLOCATED (pstr) - || dfa->mb_cur_max > 1) ? pstr->valid_len : len; + pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; + pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; + pstr->valid_raw_len = pstr->valid_len; return REG_NOERROR; } @@ -88,9 +85,6 @@ re_string_construct (pstr, str, len, trans, icase, dfa) { reg_errcode_t ret; re_string_construct_common (str, len, pstr, trans, icase, dfa); - pstr->stop = pstr->len; - /* Set 0 so that this function can initialize whole buffers. */ - pstr->valid_len = 0; if (len > 0) { @@ -98,15 +92,27 @@ re_string_construct (pstr, str, len, trans, icase, dfa) if (BE (ret != REG_NOERROR, 0)) return ret; } - pstr->mbs_case = (MBS_CASE_ALLOCATED (pstr) ? pstr->mbs_case - : (unsigned char *) str); - pstr->mbs = MBS_ALLOCATED (pstr) ? pstr->mbs : pstr->mbs_case; + pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; if (icase) { #ifdef RE_ENABLE_I18N if (dfa->mb_cur_max > 1) - build_wcs_upper_buffer (pstr); + { + while (1) + { + ret = build_wcs_upper_buffer (pstr); + if (BE (ret != REG_NOERROR, 0)) + return ret; + if (pstr->valid_raw_len >= len) + break; + if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) + break; + ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); + if (BE (ret != REG_NOERROR, 0)) + return ret; + } + } else #endif /* RE_ENABLE_I18N */ build_upper_buffer (pstr); @@ -122,12 +128,13 @@ re_string_construct (pstr, str, len, trans, icase, dfa) if (trans != NULL) re_string_translate_buffer (pstr); else - pstr->valid_len = len; + { + pstr->valid_len = pstr->bufs_len; + pstr->valid_raw_len = pstr->bufs_len; + } } } - /* Initialized whole buffers, then valid_len == bufs_len. */ - pstr->valid_len = pstr->bufs_len; return REG_NOERROR; } @@ -145,9 +152,16 @@ re_string_realloc_buffers (pstr, new_buf_len) if (BE (new_array == NULL, 0)) return REG_ESPACE; pstr->wcs = new_array; + if (pstr->offsets != NULL) + { + int *new_array = re_realloc (pstr->offsets, int, new_buf_len); + if (BE (new_array == NULL, 0)) + return REG_ESPACE; + pstr->offsets = new_array; + } } #endif /* RE_ENABLE_I18N */ - if (MBS_ALLOCATED (pstr)) + if (pstr->mbs_allocated) { unsigned char *new_array = re_realloc (pstr->mbs, unsigned char, new_buf_len); @@ -155,16 +169,6 @@ re_string_realloc_buffers (pstr, new_buf_len) return REG_ESPACE; pstr->mbs = new_array; } - if (MBS_CASE_ALLOCATED (pstr)) - { - unsigned char *new_array = re_realloc (pstr->mbs_case, unsigned char, - new_buf_len); - if (BE (new_array == NULL, 0)) - return REG_ESPACE; - pstr->mbs_case = new_array; - if (!MBS_ALLOCATED (pstr)) - pstr->mbs = pstr->mbs_case; - } pstr->bufs_len = new_buf_len; return REG_NOERROR; } @@ -182,11 +186,15 @@ re_string_construct_common (str, len, pstr, trans, icase, dfa) memset (pstr, '\0', sizeof (re_string_t)); pstr->raw_mbs = (const unsigned char *) str; pstr->len = len; + pstr->raw_len = len; pstr->trans = trans; pstr->icase = icase ? 1 : 0; + pstr->mbs_allocated = (trans != NULL || icase); pstr->mb_cur_max = dfa->mb_cur_max; pstr->is_utf8 = dfa->is_utf8; pstr->map_notascii = dfa->map_notascii; + pstr->stop = pstr->len; + pstr->raw_stop = pstr->stop; } #ifdef RE_ENABLE_I18N @@ -206,18 +214,39 @@ static void build_wcs_buffer (pstr) re_string_t *pstr; { +#ifdef _LIBC + unsigned char buf[pstr->mb_cur_max]; +#else + unsigned char buf[64]; +#endif mbstate_t prev_st; int byte_idx, end_idx, mbclen, remain_len; + /* Build the buffers from pstr->valid_len to either pstr->len or pstr->bufs_len. */ - end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; + end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; for (byte_idx = pstr->valid_len; byte_idx < end_idx;) { wchar_t wc; + const char *p; + remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx - + byte_idx), remain_len, &pstr->cur_state); + /* Apply the translation if we need. */ + if (BE (pstr->trans != NULL, 0)) + { + int i, ch; + + for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) + { + ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; + buf[i] = pstr->trans[ch]; + } + p = (const char *) buf; + } + else + p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; + mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2, 0)) { /* The buffer doesn't have enough space, finish to build. */ @@ -229,15 +258,11 @@ build_wcs_buffer (pstr) /* We treat these cases as a singlebyte character. */ mbclen = 1; wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; + if (BE (pstr->trans != NULL, 0)) + wc = pstr->trans[wc]; pstr->cur_state = prev_st; } - /* Apply the translation if we need. */ - if (pstr->trans != NULL && mbclen == 1) - { - int ch = pstr->trans[pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]]; - pstr->mbs_case[byte_idx] = ch; - } /* Write wide character and padding. */ pstr->wcs[byte_idx++] = wc; /* Write paddings. */ @@ -245,61 +270,83 @@ build_wcs_buffer (pstr) pstr->wcs[byte_idx++] = WEOF; } pstr->valid_len = byte_idx; + pstr->valid_raw_len = byte_idx; } /* Build wide character buffer PSTR->WCS like build_wcs_buffer, but for REG_ICASE. */ -static void +static int build_wcs_upper_buffer (pstr) re_string_t *pstr; { mbstate_t prev_st; - int byte_idx, end_idx, mbclen, remain_len; - /* Build the buffers from pstr->valid_len to either pstr->len or - pstr->bufs_len. */ - end_idx = (pstr->bufs_len > pstr->len)? pstr->len : pstr->bufs_len; + int src_idx, byte_idx, end_idx, mbclen, remain_len; +#ifdef _LIBC + unsigned char buf[pstr->mb_cur_max]; +#else + unsigned char buf[64]; +#endif + + byte_idx = pstr->valid_len; + end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; #ifdef _LIBC /* The following optimization assumes that the wchar_t encoding is always ISO 10646. */ - if (! pstr->map_notascii && pstr->trans == NULL) - for (byte_idx = pstr->valid_len; byte_idx < end_idx;) - if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) - && mbsinit (&pstr->cur_state)) - { - /* In case of a singlebyte character. */ - pstr->mbs[byte_idx] - = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); - /* The next step uses the assumption that wchar_t is encoded - with ISO 10646: all ASCII values can be converted like this. */ - pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; - ++byte_idx; - } - else + if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) + { + while (byte_idx < end_idx) { wchar_t wc; + + if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) + && mbsinit (&pstr->cur_state)) + { + /* In case of a singlebyte character. */ + pstr->mbs[byte_idx] + = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); + /* The next step uses the assumption that wchar_t is encoded + with ISO 10646: all ASCII values can be converted like + this. */ + pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; + ++byte_idx; + continue; + } + remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; mbclen = mbrtowc (&wc, ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx), remain_len, &pstr->cur_state); - if (BE (mbclen > 1, 1)) + if (BE (mbclen > 0, 1)) { + wchar_t wcu = wc; if (iswlower (wc)) - wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), - &prev_st); + { + int mbcdlen; + + wcu = towupper (wc); + mbcdlen = wcrtomb (buf, wcu, &prev_st); + if (BE (mbclen == mbcdlen, 1)) + memcpy (pstr->mbs + byte_idx, buf, mbclen); + else + { + src_idx = byte_idx; + goto offsets_needed; + } + } else memcpy (pstr->mbs + byte_idx, pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); - pstr->wcs[byte_idx++] = towupper (wc); + pstr->wcs[byte_idx++] = wcu; /* Write paddings. */ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } else if (mbclen == (size_t) -1 || mbclen == 0) { - /* It is an invalid character. Just use the byte. */ + /* It is an invalid character or '\0'. Just use the byte. */ int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; pstr->mbs[byte_idx] = ch; /* And also cast it to wide char. */ @@ -314,48 +361,116 @@ build_wcs_upper_buffer (pstr) break; } } + pstr->valid_len = byte_idx; + pstr->valid_raw_len = byte_idx; + return REG_NOERROR; + } else #endif - for (byte_idx = pstr->valid_len; byte_idx < end_idx;) + for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) { wchar_t wc; + const char *p; +offsets_needed: remain_len = end_idx - byte_idx; prev_st = pstr->cur_state; - mbclen = mbrtowc (&wc, - ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx - + byte_idx), remain_len, &pstr->cur_state); - if (mbclen == 1) + if (BE (pstr->trans != NULL, 0)) { - /* In case of a singlebyte character. */ - int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; - /* Apply the translation if we need. */ - if (BE (pstr->trans != NULL, 0) && mbclen == 1) + int i, ch; + + for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) { - ch = pstr->trans[ch]; - pstr->mbs_case[byte_idx] = ch; + ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; + buf[i] = pstr->trans[ch]; } - pstr->wcs[byte_idx] = towupper (wc); - pstr->mbs[byte_idx++] = toupper (ch); - if (BE (mbclen == (size_t) -1, 0)) - pstr->cur_state = prev_st; + p = (const char *) buf; } - else if (BE (mbclen != (size_t) -2, 1)) + else + p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; + mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state); + if (BE (mbclen > 0, 1)) { + wchar_t wcu = wc; if (iswlower (wc)) - wcrtomb ((char *) pstr->mbs + byte_idx, towupper (wc), &prev_st); + { + int mbcdlen; + + wcu = towupper (wc); + mbcdlen = wcrtomb (buf, wcu, &prev_st); + if (BE (mbclen == mbcdlen, 1)) + memcpy (pstr->mbs + byte_idx, buf, mbclen); + else + { + int i; + + if (byte_idx + mbcdlen > pstr->bufs_len) + { + pstr->cur_state = prev_st; + break; + } + + if (pstr->offsets == NULL) + { + pstr->offsets = re_malloc (int, pstr->bufs_len); + + if (pstr->offsets == NULL) + return REG_ESPACE; + } + if (!pstr->offsets_needed) + { + for (i = 0; i < byte_idx; ++i) + pstr->offsets[i] = i; + pstr->offsets_needed = 1; + } + + memcpy (pstr->mbs + byte_idx, buf, mbcdlen); + pstr->wcs[byte_idx] = wcu; + pstr->offsets[byte_idx] = src_idx; + for (i = 1; i < mbcdlen; ++i) + { + pstr->offsets[byte_idx + i] + = src_idx + (i < mbclen ? i : mbclen - 1); + pstr->wcs[byte_idx + i] = WEOF; + } + pstr->len += mbcdlen - mbclen; + if (pstr->raw_stop > src_idx) + pstr->stop += mbcdlen - mbclen; + end_idx = (pstr->bufs_len > pstr->len) + ? pstr->len : pstr->bufs_len; + byte_idx += mbcdlen; + src_idx += mbclen; + continue; + } + } else - memcpy (pstr->mbs + byte_idx, - pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); - pstr->wcs[byte_idx++] = towupper (wc); + memcpy (pstr->mbs + byte_idx, p, mbclen); + + if (BE (pstr->offsets_needed != 0, 0)) + { + int i; + for (i = 0; i < mbclen; ++i) + pstr->offsets[byte_idx + i] = src_idx + i; + } + src_idx += mbclen; + + pstr->wcs[byte_idx++] = wcu; /* Write paddings. */ for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) pstr->wcs[byte_idx++] = WEOF; } else if (mbclen == (size_t) -1 || mbclen == 0) { - /* It is an invalid character. Just use the byte. */ - int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; + /* It is an invalid character or '\0'. Just use the byte. */ + int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; + + if (BE (pstr->trans != NULL, 0)) + ch = pstr->trans [ch]; pstr->mbs[byte_idx] = ch; + + if (BE (pstr->offsets_needed != 0, 0)) + pstr->offsets[byte_idx] = src_idx; + ++src_idx; + /* And also cast it to wide char. */ pstr->wcs[byte_idx++] = (wchar_t) ch; if (BE (mbclen == (size_t) -1, 0)) @@ -369,6 +484,8 @@ build_wcs_upper_buffer (pstr) } } pstr->valid_len = byte_idx; + pstr->valid_raw_len = src_idx; + return REG_NOERROR; } /* Skip characters until the index becomes greater than NEW_RAW_IDX. @@ -385,7 +502,7 @@ re_string_skip_chars (pstr, new_raw_idx, last_wc) wchar_t wc = 0; /* Skip the characters which are not necessary to check. */ - for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len; + for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; rawbuf_idx < new_raw_idx;) { int remain_len; @@ -420,17 +537,15 @@ build_upper_buffer (pstr) for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) { int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; - if (pstr->trans != NULL) - { - ch = pstr->trans[ch]; - pstr->mbs_case[char_idx] = ch; - } + if (BE (pstr->trans != NULL, 0)) + ch = pstr->trans[ch]; if (islower (ch)) pstr->mbs[char_idx] = toupper (ch); else pstr->mbs[char_idx] = ch; } pstr->valid_len = char_idx; + pstr->valid_raw_len = char_idx; } /* Apply TRANS to the buffer in PSTR. */ @@ -445,10 +560,11 @@ re_string_translate_buffer (pstr) for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) { int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; - pstr->mbs_case[buf_idx] = pstr->trans[ch]; + pstr->mbs[buf_idx] = pstr->trans[ch]; } pstr->valid_len = buf_idx; + pstr->valid_raw_len = buf_idx; } /* This function re-construct the buffers. @@ -468,14 +584,15 @@ re_string_reconstruct (pstr, idx, eflags, newline) if (pstr->mb_cur_max > 1) memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); #endif /* RE_ENABLE_I18N */ - pstr->len += pstr->raw_mbs_idx; - pstr->stop += pstr->raw_mbs_idx; - pstr->valid_len = pstr->raw_mbs_idx = 0; + pstr->len = pstr->raw_len; + pstr->stop = pstr->raw_stop; + pstr->valid_len = 0; + pstr->raw_mbs_idx = 0; + pstr->valid_raw_len = 0; + pstr->offsets_needed = 0; pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF : CONTEXT_NEWLINE | CONTEXT_BEGBUF); - if (!MBS_CASE_ALLOCATED (pstr)) - pstr->mbs_case = (unsigned char *) pstr->raw_mbs; - if (!MBS_ALLOCATED (pstr) && !MBS_CASE_ALLOCATED (pstr)) + if (!pstr->mbs_allocated) pstr->mbs = (unsigned char *) pstr->raw_mbs; offset = idx; } @@ -483,7 +600,13 @@ re_string_reconstruct (pstr, idx, eflags, newline) if (offset != 0) { /* Are the characters which are already checked remain? */ - if (offset < pstr->valid_len) + if (offset < pstr->valid_raw_len +#ifdef RE_ENABLE_I18N + /* Handling this would enlarge the code too much. + Accept a slowdown in that case. */ + && pstr->offsets_needed == 0 +#endif + ) { /* Yes, move them to the front of the buffer. */ pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, @@ -493,13 +616,11 @@ re_string_reconstruct (pstr, idx, eflags, newline) memmove (pstr->wcs, pstr->wcs + offset, (pstr->valid_len - offset) * sizeof (wint_t)); #endif /* RE_ENABLE_I18N */ - if (MBS_ALLOCATED (pstr)) + if (pstr->mbs_allocated) memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); - if (MBS_CASE_ALLOCATED (pstr)) - memmove (pstr->mbs_case, pstr->mbs_case + offset, - pstr->valid_len - offset); pstr->valid_len -= offset; + pstr->valid_raw_len -= offset; #if DEBUG assert (pstr->valid_len > 0); #endif @@ -507,16 +628,26 @@ re_string_reconstruct (pstr, idx, eflags, newline) else { /* No, skip all characters until IDX. */ +#ifdef RE_ENABLE_I18N + if (BE (pstr->offsets_needed, 0)) + { + pstr->len = pstr->raw_len - idx + offset; + pstr->stop = pstr->raw_stop - idx + offset; + pstr->offsets_needed = 0; + } +#endif pstr->valid_len = 0; + pstr->valid_raw_len = 0; #ifdef RE_ENABLE_I18N if (pstr->mb_cur_max > 1) { int wcs_idx; wint_t wc = WEOF; +#ifdef _LIBC if (pstr->is_utf8) { - const unsigned char *raw, *p, *end; + const unsigned char *raw, *p, *q, *end; /* Special case UTF-8. Multi-byte chars start with any byte other than 0x80 - 0xbf. */ @@ -527,13 +658,22 @@ re_string_reconstruct (pstr, idx, eflags, newline) { mbstate_t cur_state; wchar_t wc2; - int mlen; + int mlen = raw + pstr->len - p; + unsigned char buf[6]; + q = p; + if (BE (pstr->trans != NULL, 0)) + { + int i = mlen < 6 ? mlen : 6; + while (--i >= 0) + buf[i] = pstr->trans[p[i]]; + q = buf; + } /* XXX Don't use mbrtowc, we know which conversion to use (UTF-8 -> UCS4). */ memset (&cur_state, 0, sizeof (cur_state)); - mlen = mbrtowc (&wc2, p, raw + pstr->len - p, - &cur_state) - (raw + offset - p); + mlen = mbrtowc (&wc2, p, mlen, &cur_state) + - (raw + offset - p); if (mlen >= 0) { memset (&pstr->cur_state, '\0', @@ -544,12 +684,17 @@ re_string_reconstruct (pstr, idx, eflags, newline) break; } } +#endif if (wc == WEOF) pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; - for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) - pstr->wcs[wcs_idx] = WEOF; - if (pstr->trans && wc <= 0xff) - wc = pstr->trans[wc]; + if (BE (pstr->valid_len, 0)) + { + for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) + pstr->wcs[wcs_idx] = WEOF; + if (pstr->mbs_allocated) + memset (pstr->mbs, 255, pstr->valid_len); + } + pstr->valid_raw_len = pstr->valid_len; pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD : ((newline && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0)); @@ -565,13 +710,8 @@ re_string_reconstruct (pstr, idx, eflags, newline) ? CONTEXT_NEWLINE : 0)); } } - if (!MBS_CASE_ALLOCATED (pstr)) - { - pstr->mbs_case += offset; - /* In case of !MBS_ALLOCATED && !MBS_CASE_ALLOCATED. */ - if (!MBS_ALLOCATED (pstr)) - pstr->mbs += offset; - } + if (!pstr->mbs_allocated) + pstr->mbs += offset; } pstr->raw_mbs_idx = idx; pstr->len -= offset; @@ -582,7 +722,11 @@ re_string_reconstruct (pstr, idx, eflags, newline) if (pstr->mb_cur_max > 1) { if (pstr->icase) - build_wcs_upper_buffer (pstr); + { + int ret = build_wcs_upper_buffer (pstr); + if (BE (ret != REG_NOERROR, 0)) + return ret; + } else build_wcs_buffer (pstr); } @@ -601,17 +745,95 @@ re_string_reconstruct (pstr, idx, eflags, newline) return REG_NOERROR; } +static unsigned char +re_string_peek_byte_case (const re_string_t *pstr, + int idx) +{ + int ch, off; + + /* Handle the common (easiest) cases first. */ + if (BE (!pstr->icase, 1)) + return re_string_peek_byte (pstr, idx); + +#ifdef RE_ENABLE_I18N + if (pstr->mb_cur_max > 1 + && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) + return re_string_peek_byte (pstr, idx); +#endif + + off = pstr->cur_idx + idx; +#ifdef RE_ENABLE_I18N + if (pstr->offsets_needed) + off = pstr->offsets[off]; +#endif + + ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; + if (pstr->trans) + ch = pstr->trans[ch]; + +#ifdef RE_ENABLE_I18N + /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I + this function returns CAPITAL LETTER I instead of first byte of + DOTLESS SMALL LETTER I. The latter would confuse the parser, + since peek_byte_case doesn't advance cur_idx in any way. */ + if (pstr->offsets_needed && !isascii (ch)) + return re_string_peek_byte (pstr, idx); +#endif + + return ch; +} + +static unsigned char +re_string_fetch_byte_case (re_string_t *pstr) +{ + int ch, off; + + if (BE (!pstr->icase, 1)) + return re_string_fetch_byte (pstr); + +#ifdef RE_ENABLE_I18N + if (pstr->offsets_needed) + { + /* For tr_TR.UTF-8 [[:islower:]] there is + [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip + in that case the whole multi-byte character and return + the original letter. On the other side, with + [[: DOTLESS SMALL LETTER I return [[:I, as doing + anything else would complicate things too much. */ + + if (!re_string_first_byte (pstr, pstr->cur_idx)) + return re_string_fetch_byte (pstr); + + off = pstr->offsets[pstr->cur_idx]; + ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; + if (pstr->trans) + ch = pstr->trans[ch]; + + if (! isascii (ch)) + return re_string_fetch_byte (pstr); + + re_string_skip_bytes (pstr, + re_string_char_size_at (pstr, pstr->cur_idx)); + return ch; + } +#endif + + ch = pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; + if (pstr->trans) + ch = pstr->trans[ch]; + return ch; +} + static void re_string_destruct (pstr) re_string_t *pstr; { #ifdef RE_ENABLE_I18N re_free (pstr->wcs); + re_free (pstr->offsets); #endif /* RE_ENABLE_I18N */ - if (MBS_ALLOCATED (pstr)) + if (pstr->mbs_allocated) re_free (pstr->mbs); - if (MBS_CASE_ALLOCATED (pstr)) - re_free (pstr->mbs_case); } /* Return the context at IDX in INPUT. */ -- cgit v1.2.3