aboutsummaryrefslogtreecommitdiff
path: root/wcsmbs
diff options
context:
space:
mode:
Diffstat (limited to 'wcsmbs')
-rw-r--r--wcsmbs/btowc.c10
-rw-r--r--wcsmbs/mbrlen.c3
-rw-r--r--wcsmbs/mbrtowc.c111
-rw-r--r--wcsmbs/mbsinit.c18
-rw-r--r--wcsmbs/mbsrtowcs.c114
-rw-r--r--wcsmbs/wchar.h20
-rw-r--r--wcsmbs/wcrtomb.c62
-rw-r--r--wcsmbs/wcsrtombs.c95
-rw-r--r--wcsmbs/wctob.c9
9 files changed, 332 insertions, 110 deletions
diff --git a/wcsmbs/btowc.c b/wcsmbs/btowc.c
index 062be7ec02..2f13cc7ce4 100644
--- a/wcsmbs/btowc.c
+++ b/wcsmbs/btowc.c
@@ -21,16 +21,14 @@ Boston, MA 02111-1307, USA. */
#include <wchar.h>
+/* We use UTF8 encoding for multibyte strings and therefore a valid
+ one byte multibyte string can only have a value from 0 to 0x7f. */
wint_t
btowc (c)
int c;
{
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
- if (WEOF != (wint_t) EOF)
+ if (WEOF != (wint_t) EOF || c < 0 || c > 0x7f)
return WEOF;
else
- return c;
+ return (wint_t) c;
}
diff --git a/wcsmbs/mbrlen.c b/wcsmbs/mbrlen.c
index a50631e8d1..c5a27116be 100644
--- a/wcsmbs/mbrlen.c
+++ b/wcsmbs/mbrlen.c
@@ -26,10 +26,11 @@ static mbstate_t internal;
size_t
-mbrlen (s, n, ps)
+__mbrlen (s, n, ps)
const char *s;
size_t n;
mbstate_t *ps;
{
return mbrtowc (NULL, s, n, ps ?: &internal);
}
+weak_alias (__mbrlen, mbrlen)
diff --git a/wcsmbs/mbrtowc.c b/wcsmbs/mbrtowc.c
index 2c4b0779da..9e70a0b2c9 100644
--- a/wcsmbs/mbrtowc.c
+++ b/wcsmbs/mbrtowc.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -17,50 +17,115 @@ License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
+#include <errno.h>
#include <wchar.h>
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
+
static mbstate_t internal;
size_t
-mbrtowc (pwc, s, n, ps)
- wchar_t *pwc;
- const char *s;
- size_t n;
- mbstate_t *ps;
+mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
wchar_t to_wide;
+ size_t used = 0;
if (ps == NULL)
ps = &internal;
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
-
if (s == NULL)
{
+ /* See first paragraph of description in 7.16.6.3.2. */
pwc = NULL;
s = "";
n = 1;
}
- if (n == 0)
- return (size_t) -2;
+ if (n > 0)
+ {
+ if (ps->count == 0)
+ {
+ unsigned char byte = (unsigned char) *s++;
+ ++used;
- /* For now. */
- to_wide = (wchar_t) *s;
+ /* We must look for a possible first byte of a UTF8 sequence. */
+ if (byte < 0x80)
+ {
+ /* One byte sequence. */
+ if (pwc != NULL)
+ *pwc = (wchar_t) byte;
+ return byte ? used : 0;
+ }
- if (pwc != NULL)
- *pwc = to_wide;
+ if ((byte & 0xc0) == 0x80 || (byte & 0xfe) == 0xfe)
+ {
+ /* Oh, oh. An encoding error. */
+ errno = EILSEQ;
+ return (size_t) -1;
+ }
- if (pwc == L'\0')
- {
- *ps = 0; /* This is required. */
- return 0;
+ if ((byte & 0xe0) == 0xc0)
+ {
+ /* We expect two bytes. */
+ ps->count = 1;
+ ps->value = byte & 0x1f;
+ }
+ else if ((byte & 0xf0) == 0xe0)
+ {
+ /* We expect three bytes. */
+ ps->count = 2;
+ ps->value = byte & 0x0f;
+ }
+ else if ((byte & 0xf8) == 0xf0)
+ {
+ /* We expect four bytes. */
+ ps->count = 3;
+ ps->value = byte & 0x07;
+ }
+ else if ((byte & 0xfc) == 0xf8)
+ {
+ /* We expect five bytes. */
+ ps->count = 4;
+ ps->value = byte & 0x03;
+ }
+ else
+ {
+ /* We expect six bytes. */
+ ps->count = 5;
+ ps->value = byte & 0x01;
+ }
+ }
+
+ /* We know we have to handle a multibyte character and there are
+ some more bytes to read. */
+ while (used < n)
+ {
+ /* The second to sixth bytes must be of the form 10xxxxxx. */
+ unsigned char byte = (unsigned char) *s++;
+ ++used;
+
+ if ((byte & 0xc0) != 0x80)
+ {
+ /* Oh, oh. An encoding error. */
+ errno = EILSEQ;
+ return (size_t) -1;
+ }
+
+ ps->value <<= 6;
+ ps->value |= byte & 0x3f;
+
+ if (--ps->count == 0)
+ {
+ /* The character is finished. */
+ if (pwc != NULL)
+ *pwc = (wchar_t) ps->value;
+ return ps->value ? used : 0;
+ }
+ }
}
- /* Return code (size_t)-1 cannot happend for now. */
- return 1;
+ return (size_t) -2;
}
diff --git a/wcsmbs/mbsinit.c b/wcsmbs/mbsinit.c
index efbfd09347..f56ce20331 100644
--- a/wcsmbs/mbsinit.c
+++ b/wcsmbs/mbsinit.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -20,15 +20,17 @@ Boston, MA 02111-1307, USA. */
#include <string.h>
#include <wchar.h>
-
+/* In GNU libc the conversion functions can only convert between the
+ fixed wide character representation and the multibyte
+ representation of the same character set. Since we use ISO 10646
+ in UCS4 encoding for wide characters the best solution for
+ multibyte characters is the UTF8 encoding. I.e., the only state
+ information is a counter of the processed bytes so far and the
+ value collected so far. In particular, we don't have different shift
+ states. */
int
mbsinit (ps)
const mbstate_t *ps;
{
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
-
- return ps == NULL || *ps == 0;
+ return ps == NULL || ps->count == 0;
}
diff --git a/wcsmbs/mbsrtowcs.c b/wcsmbs/mbsrtowcs.c
index dc026b7252..712b199271 100644
--- a/wcsmbs/mbsrtowcs.c
+++ b/wcsmbs/mbsrtowcs.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -17,9 +17,16 @@ License along with the GNU C Library; see the file COPYING.LIB. If
not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
+#include <errno.h>
#include <wchar.h>
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
+
+/* We don't need the state really because we don't have shift states
+ to maintain between calls to this function. */
static mbstate_t internal;
size_t
@@ -29,35 +36,102 @@ mbsrtowcs (dst, src, len, ps)
size_t len;
mbstate_t *ps;
{
- size_t result = 0;
+ size_t written = 0;
+ const char *run = *src;
if (ps == NULL)
ps = &internal;
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
+ if (dst == NULL)
+ /* The LEN parameter has to be ignored if we don't actually write
+ anything. */
+ len = ~0;
- while (len > 0 && **src != '\0')
+ /* Copy all words. */
+ while (written < len)
{
- /* For now there is no possibly illegal MB char sequence. */
- if (dst != NULL)
- dst[result] = (wchar_t) **src;
- ++result;
- ++(*src);
- --len;
- }
+ wchar_t value;
+ size_t count;
+ unsigned char byte = *run++;
- if (len > 0)
- {
+ /* We expect a start of a new multibyte character. */
+ if (byte < 0x80)
+ {
+ /* One byte sequence. */
+ count = 0;
+ value = byte;
+ }
+ else if ((byte & 0xe0) == 0xc0)
+ {
+ count = 1;
+ value = byte & 0x1f;
+ }
+ else if ((byte & 0xf0) == 0xe0)
+ {
+ /* We expect three bytes. */
+ count = 2;
+ value = byte & 0x0f;
+ }
+ else if ((byte & 0xf8) == 0xf0)
+ {
+ /* We expect four bytes. */
+ count = 3;
+ value = byte & 0x07;
+ }
+ else if ((byte & 0xfc) == 0xf8)
+ {
+ /* We expect five bytes. */
+ count = 4;
+ value = byte & 0x03;
+ }
+ else if ((byte & 0xfe) == 0xfc)
+ {
+ /* We expect six bytes. */
+ count = 5;
+ value = byte & 0x01;
+ }
+ else
+ {
+ /* This is an illegal encoding. */
+ errno = EILSEQ;
+ return (size_t) -1;
+ }
+
+ /* Read the possible remaining bytes. */
+ while (count-- > 0)
+ {
+ byte = *run++;
+
+ if ((byte & 0xc0) != 0x80)
+ {
+ /* This is an illegal encoding. */
+ errno = EILSEQ;
+ return (size_t) -1;
+ }
+
+ value <<= 6;
+ value |= byte & 0x3f;
+ }
+
+ /* Store value if required. */
if (dst != NULL)
+ *dst++ = value;
+
+ /* The whole sequence is read. Check whether end of string is
+ reached. */
+ if (value == L'\0')
{
- dst[result] = L'\0';
- *ps = 0;
+ /* Found the end of the string. */
+ *src = NULL;
+ return written;
}
- *src = NULL;
+
+ /* Increment counter of produced words. */
+ ++written;
}
- return result;
+ /* Store address of next byte to process. */
+ *src = run;
+
+ return written;
}
diff --git a/wcsmbs/wchar.h b/wcsmbs/wchar.h
index cc821b8a50..806bafa655 100644
--- a/wcsmbs/wchar.h
+++ b/wcsmbs/wchar.h
@@ -48,7 +48,11 @@ typedef unsigned int wint_t;
/* Conversion state information. */
-typedef int mbstate_t; /* FIXME */
+typedef struct
+{
+ int count; /* Number of bytes needed for the current character. */
+ wint_t value; /* Value so far. */
+} mbstate_t;
#define WCHAR_MIN ((wchar_t) 0)
#define WCHAR_MAX (~WCHAR_MIN)
@@ -145,9 +149,6 @@ extern int wctob __P ((wint_t __c));
state. */
extern int mbsinit __P ((__const mbstate_t *__ps));
-/* Return number of bytes in multibyte character pointed to by S. */
-extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *ps));
-
/* Write wide character representation of multibyte character pointed
to by S to PWC. */
extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n,
@@ -156,6 +157,17 @@ extern size_t mbrtowc __P ((wchar_t *__pwc, __const char *__s, size_t __n,
/* Write multibyte representation of wide character WC to S. */
extern size_t wcrtomb __P ((char *__s, wchar_t __wc, mbstate_t *__ps));
+/* Return number of bytes in multibyte character pointed to by S. */
+extern size_t __mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps));
+extern size_t mbrlen __P ((__const char *__s, size_t __n, mbstate_t *__ps));
+
+#if defined (__OPTIMIZE__) \
+ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* Define inline function as optimization. */
+extern __inline size_t mbrlen (__const char *s, size_t n, mbstate_t *ps)
+{ return ps != NULL ? mbrtowc (NULL, s, n, ps) : __mbrlen (s, n, NULL); }
+#endif
+
/* Write wide character representation of multibyte chracter string SRC
to DST. */
extern size_t mbsrtowcs __P ((wchar_t *__dst, __const char **__src,
diff --git a/wcsmbs/wcrtomb.c b/wcsmbs/wcrtomb.c
index 9069fb105c..eb007a69b9 100644
--- a/wcsmbs/wcrtomb.c
+++ b/wcsmbs/wcrtomb.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -24,46 +24,68 @@ Boston, MA 02111-1307, USA. */
#define EILSEQ EINVAL
#endif
+static const wchar_t encoding_mask[] =
+{
+ ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
+
+static const unsigned char encoding_byte[] =
+{
+ 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
+/* The state is not used for this UTF8 encoding. */
static mbstate_t internal;
size_t
-wcrtomb (s, wc, ps)
- char *s;
- wchar_t wc;
- mbstate_t *ps;
+wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
{
char fake[1];
+ size_t written = 0;
if (ps == NULL)
ps = &internal;
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
-
if (s == NULL)
{
s = fake;
wc = L'\0';
}
- if (wc == L'\0')
+ /* Store the UTF8 representation of WC. */
+ if (wc < 0 || wc > 0x7fffffff)
{
- /* FIXME Write any shift sequence to get to *PS == NULL. */
- *ps = 0;
- *s = '\0';
+ /* This is no correct ISO 10646 character. */
+ errno = EILSEQ;
+ return (size_t) -1;
+ }
+
+ if (wc < 0x80)
+ {
+ /* It's a one byte sequence. */
+ if (s != NULL)
+ *s = (char) wc;
return 1;
}
- /* FIXME For now we don't handle real multi-byte encodings. */
- if ((wc & ~0xff) != 0)
+ for (written = 2; written < 6; ++written)
+ if ((wc & encoding_mask[written - 2]) == 0)
+ break;
+
+ if (s != NULL)
{
- errno = EILSEQ;
- return (size_t) -1;
+ size_t cnt = written;
+ s[0] = encoding_byte[cnt - 2];
+
+ --cnt;
+ do
+ {
+ s[cnt] = 0x80 | (wc & 0x3f);
+ wc >>= 6;
+ }
+ while (--cnt > 0);
+ s[0] |= wc;
}
- *s = (char) wc;
- return 1;
+ return written;
}
diff --git a/wcsmbs/wcsrtombs.c b/wcsmbs/wcsrtombs.c
index 9f1000937b..99ca6acc5b 100644
--- a/wcsmbs/wcsrtombs.c
+++ b/wcsmbs/wcsrtombs.c
@@ -1,6 +1,6 @@
/* Copyright (C) 1996 Free Software Foundation, Inc.
This file is part of the GNU C Library.
-Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>
+Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
@@ -25,6 +25,18 @@ Boston, MA 02111-1307, USA. */
#endif
+static const wchar_t encoding_mask[] =
+{
+ ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
+
+static const unsigned char encoding_byte[] =
+{
+ 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
+
+/* We don't need the state really because we don't have shift states
+ to maintain between calls to this function. */
static mbstate_t internal;
size_t
@@ -34,40 +46,79 @@ wcsrtombs (dst, src, len, ps)
size_t len;
mbstate_t *ps;
{
- size_t result = 0;
+ size_t written = 0;
+ const wchar_t *run = *src;
if (ps == NULL)
ps = &internal;
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
+ if (dst == NULL)
+ /* The LEN parameter has to be ignored if we don't actually write
+ anything. */
+ len = ~0;
- while (len > 0 && **src != L'\0')
+ while (written < len)
{
- if ((**src & ~0xff) != 0)
+ wchar_t wc = *run++;
+
+ if (wc < 0 || wc > 0x7fffffff)
{
+ /* This is no correct ISO 10646 character. */
errno = EILSEQ;
return (size_t) -1;
}
- if (dst != NULL)
- dst[result] = (char) **src;
- ++result;
- ++(*src);
- --len;
- }
-
- if (len > 0)
- {
- if (dst != NULL)
+ if (wc == L'\0')
+ {
+ /* Found the end. */
+ if (dst != NULL)
+ *dst = '\0';
+ *src = NULL;
+ return written;
+ }
+ else if (wc < 0x80)
{
- dst[result] = '\0';
- *ps = 0;
+ /* It's a one byte sequence. */
+ if (dst != NULL)
+ *dst++ = (char) wc;
+ ++written;
+ }
+ else
+ {
+ size_t step;
+
+ for (step = 2; step < 6; ++step)
+ if ((wc & encoding_mask[step - 2]) == 0)
+ break;
+
+ if (written + step >= len)
+ /* Too long. */
+ break;
+
+ if (dst != NULL)
+ {
+ size_t cnt = step;
+
+ dst[0] = encoding_byte[cnt - 2];
+
+ --cnt;
+ do
+ {
+ dst[cnt] = 0x80 | (wc & 0x3f);
+ wc >>= 6;
+ }
+ while (--cnt > 0);
+ dst[0] |= wc;
+
+ dst += step;
+ }
+
+ written += step;
}
- *src = NULL;
}
- return result;
+ /* Store position of first unprocessed word. */
+ *src = run;
+
+ return written;
}
diff --git a/wcsmbs/wctob.c b/wcsmbs/wctob.c
index c27bd6baba..f541a2e97b 100644
--- a/wcsmbs/wctob.c
+++ b/wcsmbs/wctob.c
@@ -21,14 +21,11 @@ Boston, MA 02111-1307, USA. */
#include <wchar.h>
+/* We use UTF8 encoding for multibyte strings and therefore a valid
+ one byte multibyte string can only have a value from 0 to 0x7f. */
int
wctob (c)
wint_t c;
{
- /*************************************************************\
- |* This is no complete implementation. While the multi-byte *|
- |* character handling is not finished this will do. *|
- \*************************************************************/
-
- return (c & ~0xff) == 0 ? c : EOF;
+ return (c >= 0 && c <= 0x7f) ? c : EOF;
}