diff options
author | Joseph Myers <joseph@codesourcery.com> | 2018-10-19 16:31:29 +0000 |
---|---|---|
committer | Joseph Myers <joseph@codesourcery.com> | 2018-10-19 16:31:29 +0000 |
commit | d0a74159792477e5922f53fa2aa6d58eb8265a14 (patch) | |
tree | fcce06511d2a91afa649f629cde15875b6fe6f6a | |
parent | f997b4be18f7e57d757d39e42f7715db26528aa0 (diff) | |
download | glibc-d0a74159792477e5922f53fa2aa6d58eb8265a14.tar glibc-d0a74159792477e5922f53fa2aa6d58eb8265a14.tar.gz glibc-d0a74159792477e5922f53fa2aa6d58eb8265a14.tar.bz2 glibc-d0a74159792477e5922f53fa2aa6d58eb8265a14.zip |
Handle surrogate pairs in c16rtomb (bug 23794, DR#488, C2X).
The c16rtomb implementation has:
// XXX The ISO C 11 spec I have does not say anything about handling
// XXX surrogates in this interface.
The DR#488 resolution, as applied to C2X, requires surrogate pairs to
be handled here (so the first call returns 0 and stores the high
surrogate in the mbstate_t, while the second call combines the
surrogates, produces a multibyte character and returns the number of
bytes written). This patch implements that. (mbrtoc16 already
handled producing surrogates as output.)
Tested for x86_64.
[BZ #23794]
* wcsmbs/c16rtomb.c (c16rtomb): Save first character of surrogate
pair and return 0 in that case, and use saved character to
interpret following character.
* wcsmbs/tst-c16-surrogate.c: New file.
* wcsmbs/Makefile (tests): Add tst-c16-surrogate.c.
[$(run-built-tests) = yes] ($(objpfx)tst-c16-surrogate.out):
Depend on $(gen-locales)
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | wcsmbs/Makefile | 4 | ||||
-rw-r--r-- | wcsmbs/c16rtomb.c | 41 | ||||
-rw-r--r-- | wcsmbs/tst-c16-surrogate.c | 89 |
4 files changed, 141 insertions, 4 deletions
@@ -1,3 +1,14 @@ +2018-10-19 Joseph Myers <joseph@codesourcery.com> + + [BZ #23794] + * wcsmbs/c16rtomb.c (c16rtomb): Save first character of surrogate + pair and return 0 in that case, and use saved character to + interpret following character. + * wcsmbs/tst-c16-surrogate.c: New file. + * wcsmbs/Makefile (tests): Add tst-c16-surrogate.c. + [$(run-built-tests) = yes] ($(objpfx)tst-c16-surrogate.out): + Depend on $(gen-locales) + 2018-10-19 Ilya Yu. Malakhov <malakhov@mcst.ru> [BZ #23562] diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile index 2e02b22c3b..a58d36ff8f 100644 --- a/wcsmbs/Makefile +++ b/wcsmbs/Makefile @@ -51,7 +51,8 @@ tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \ tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \ tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \ tst-wcstod-round test-char-types tst-fgetwc-after-eof \ - tst-wcstod-nan-sign $(addprefix test-,$(strop-tests)) + tst-wcstod-nan-sign tst-c16-surrogate \ + $(addprefix test-,$(strop-tests)) include ../Rules @@ -68,6 +69,7 @@ $(objpfx)tst-wcrtomb.out: $(gen-locales) $(objpfx)wcsmbs-tst1.out: $(gen-locales) $(objpfx)tst-wcstol-locale.out: $(gen-locales) $(objpfx)tst-wcstod-nan-locale.out: $(gen-locales) +$(objpfx)tst-c16-surrogate.out: $(gen-locales) endif $(objpfx)tst-wcstod-round: $(libm) diff --git a/wcsmbs/c16rtomb.c b/wcsmbs/c16rtomb.c index 48a63d067b..74950d8173 100644 --- a/wcsmbs/c16rtomb.c +++ b/wcsmbs/c16rtomb.c @@ -26,7 +26,42 @@ static mbstate_t state; size_t c16rtomb (char *s, char16_t c16, mbstate_t *ps) { - // XXX The ISO C 11 spec I have does not say anything about handling - // XXX surrogates in this interface. - return wcrtomb (s, c16, ps ?: &state); + wchar_t wc = c16; + + if (ps == NULL) + ps = &state; + + if (s == NULL) + { + /* Reset any state relating to surrogate pairs. */ + ps->__count &= 0x7fffffff; + ps->__value.__wch = 0; + wc = 0; + } + + if (ps->__count & 0x80000000) + { + /* The previous call passed in the first surrogate of a + surrogate pair. */ + ps->__count &= 0x7fffffff; + if (wc >= 0xdc00 && wc < 0xe000) + wc = (0x10000 + + ((ps->__value.__wch & 0x3ff) << 10) + + (wc & 0x3ff)); + else + /* This is not a low surrogate; ensure an EILSEQ error by + trying to decode the high surrogate as a wide character on + its own. */ + wc = ps->__value.__wch; + ps->__value.__wch = 0; + } + else if (wc >= 0xd800 && wc < 0xdc00) + { + /* The high part of a surrogate pair. */ + ps->__count |= 0x80000000; + ps->__value.__wch = wc; + return 0; + } + + return wcrtomb (s, wc, ps); } diff --git a/wcsmbs/tst-c16-surrogate.c b/wcsmbs/tst-c16-surrogate.c new file mode 100644 index 0000000000..6a87ff8206 --- /dev/null +++ b/wcsmbs/tst-c16-surrogate.c @@ -0,0 +1,89 @@ +/* Test c16rtomb handling of surrogate pairs (DR#488, bug 23794). + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <errno.h> +#include <locale.h> +#include <stdio.h> +#include <string.h> +#include <uchar.h> +#include <wchar.h> +#include <array_length.h> +#include <support/check.h> + +static int +do_test (void) +{ + TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL); + /* Test conversions of surrogate pairs. */ + for (char32_t c = 0x10000; c <= 0x10ffff; c += 0x123) + { + char32_t c_pos = c - 0x10000; + char16_t c_hi = (c_pos >> 10) + 0xd800; + char16_t c_lo = (c_pos & 0x3ff) + 0xdc00; + printf ("testing U+0x%08x (0x%x 0x%x)\n", + (unsigned int) c, (unsigned int) c_hi, (unsigned int) c_lo); + char buf[16] = { 0 }; + size_t ret_hi = c16rtomb (buf, c_hi, NULL); + TEST_COMPARE (ret_hi, 0); + size_t ret_lo = c16rtomb (buf, c_lo, NULL); + TEST_COMPARE (ret_lo, 4); + wchar_t wc = 0; + size_t ret_wc = mbrtowc (&wc, buf, 4, NULL); + TEST_COMPARE (ret_wc, 4); + TEST_COMPARE (wc, (wchar_t) c); + } + /* Test errors for invalid conversions. */ + static const char16_t err_cases[][2] = + { + /* High surrogate followed by non-surrogate. */ + { 0xd800, 0x1 }, + /* High surrogate followed by another high surrogate. */ + { 0xd800, 0xd800 }, + /* Low surrogate not following high surrogate. */ + { 0xdc00, 0 } + }; + for (size_t i = 0; i < array_length (err_cases); i++) + { + char16_t c_hi = err_cases[i][0]; + char16_t c_lo = err_cases[i][1]; + printf ("testing error case: 0x%x 0x%x\n", (unsigned int) c_hi, + (unsigned int) c_lo); + c16rtomb (NULL, 0, NULL); + char buf[16] = { 0 }; + errno = 0; + size_t ret_hi = c16rtomb (buf, c_hi, NULL); + if (c_lo == 0) + { + /* Unmatched low surrogate in first place. */ + TEST_COMPARE (ret_hi, (size_t) -1); + TEST_COMPARE (errno, EILSEQ); + } + else + { + /* High surrogate; error in second place. */ + TEST_COMPARE (ret_hi, 0); + errno = 0; + size_t ret_lo = c16rtomb (buf, c_lo, NULL); + TEST_COMPARE (ret_lo, (size_t) -1); + TEST_COMPARE (errno, EILSEQ); + } + } + return 0; +} + +#include <support/test-driver.c> |