aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog9
-rw-r--r--NEWS16
-rw-r--r--catgets/Makefile5
-rw-r--r--catgets/gencat.c282
4 files changed, 274 insertions, 38 deletions
diff --git a/ChangeLog b/ChangeLog
index 8641abe1fd..2161993d45 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2000-11-27 Ulrich Drepper <drepper@redhat.com>
+
+ * catgets/Makefile (test1.cat): Set LC_ALL, LOCPATH, and GCONV_PATH
+ for gencat run.
+ (libc.cat): Likewise.
+ * catgets/gencat.c: Implement handling of message catalogs encoded
+ with stateful character sets.
+ Based on a patch by Shinya Hanataka <hanataka@abyss.rim.or.jp>.
+
2000-11-26 Ulrich Drepper <drepper@redhat.com>
* sysdeps/unix/opendir.c (__opendir): Add cast to avoid warning.
diff --git a/NEWS b/NEWS
index e6391a7f5f..4b3a977238 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes. 2000-08-13
+GNU C Library NEWS -- history of user-visible changes. 2000-11-27
Copyright (C) 1992-1999, 2000 Free Software Foundation, Inc.
See the end for copying conditions.
@@ -7,6 +7,20 @@ Please send GNU C library bug reports using the `glibcbug' script to
<bugs@gnu.org>. Questions and suggestions should be send to
<bug-glibc@gnu.org>.
+Version 2.2.1
+
+* The gencat program now parses the input file according to the charset
+ selected by the LC_CTYPE category. This is important for stateful
+ character sets. To make generating catalogs easier there is a way
+ to overwrite the charset selected by the locale: before the first
+ message or $ quote line the catalog can contain a line like
+
+ $ codeset=ISO-8859-2
+
+ to select the charset (ISO-8859-2 in this case).
+
+ Implemented by Shinya Hanataka and Ulrich Drepper.
+
Version 2.2
* Greg McGary added runtime support for bounds checking using gcc's
diff --git a/catgets/Makefile b/catgets/Makefile
index bc6575c245..caf8eec651 100644
--- a/catgets/Makefile
+++ b/catgets/Makefile
@@ -53,10 +53,13 @@ tests: $(objpfx)de/libc.cat $(objpfx)test1.cat
# This test just checks whether the program produces any error or not.
# The result is not tested.
$(objpfx)test1.cat: test1.msg $(objpfx)gencat
+ LC_ALL=hr_HR.ISO-8859-2 LOCPATH=$(common-objpfx)localedata \
+ GCONV_PATH=$(common-objpfx)iconvdata \
$(built-program-cmd) -H $(objpfx)test1.h $@ $<
$(objpfx)de/libc.cat: $(objpfx)de.msg $(objpfx)gencat
$(make-target-directory)
- $(built-program-cmd) $@ $<
+ LC_ALL=de_DE.ISO-8859-1 LOCPATH=$(common-objpfx)localedata \
+ GCONV_PATH=$(common-objpfx)iconvdata $(built-program-cmd) $@ $<
$(objpfx)tst-catgets.out: $(objpfx)de/libc.cat
# Generate a non-simple input file.
diff --git a/catgets/gencat.c b/catgets/gencat.c
index de6bdf65d8..0200ca44ef 100644
--- a/catgets/gencat.c
+++ b/catgets/gencat.c
@@ -22,11 +22,14 @@
#endif
#include <argp.h>
+#include <assert.h>
#include <ctype.h>
#include <endian.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
+#include <iconv.h>
+#include <langinfo.h>
#include <locale.h>
#include <libintl.h>
#include <limits.h>
@@ -37,6 +40,7 @@
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
#include "version.h"
@@ -79,7 +83,7 @@ struct catalog
struct set_list *all_sets;
struct set_list *current_set;
size_t total_messages;
- char quote_char;
+ wint_t quote_char;
int last_set;
struct obstack mem_pool;
@@ -137,6 +141,8 @@ static struct argp argp =
/* Wrapper functions with error checking for standard functions. */
extern void *xmalloc (size_t n);
extern void *xcalloc (size_t n, size_t s);
+extern void *xrealloc (void *o, size_t n);
+extern char *xstrdup (const char *);
/* Prototypes for local functions. */
static void error_print (void);
@@ -145,9 +151,11 @@ static struct catalog *read_input_file (struct catalog *current,
static void write_out (struct catalog *result, const char *output_name,
const char *header_name);
static struct set_list *find_set (struct catalog *current, int number);
-static void normalize_line (const char *fname, size_t line, char *string,
- char quote_char);
+static void normalize_line (const char *fname, size_t line, iconv_t cd,
+ wchar_t *string, wchar_t quote_char);
static void read_old (struct catalog *catalog, const char *file_name);
+static int open_conversion (const char *codesetp, iconv_t *cd_towcp,
+ iconv_t *cd_tombp);
int
@@ -260,6 +268,11 @@ read_input_file (struct catalog *current, const char *fname)
char *buf;
size_t len;
size_t line_number;
+ wchar_t *wbuf;
+ size_t wbufsize;
+ iconv_t cd_towc = (iconv_t) -1;
+ iconv_t cd_tomb = (iconv_t) -1;
+ char *codeset = NULL;
if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0)
{
@@ -289,6 +302,10 @@ read_input_file (struct catalog *current, const char *fname)
buf = NULL;
len = 0;
line_number = 0;
+
+ wbufsize = 1024;
+ wbuf = (wchar_t *) xmalloc (wbufsize);
+
while (!feof (fp))
{
int continued;
@@ -328,7 +345,29 @@ read_input_file (struct catalog *current, const char *fname)
if (this_line[0] == '$')
{
if (isblank (this_line[1]))
- /* This is a comment line. Do nothing. */;
+ {
+ int cnt = 1;
+ while (isblank (this_line[cnt]))
+ ++cnt;
+ if (strncmp (&this_line[cnt], "codeset=", 8) != 0)
+ /* This is a comment line. Do nothing. */;
+ else if (codeset != NULL)
+ /* Ignore multiple codeset. */;
+ else
+ {
+ int start = cnt + 8;
+ cnt = start;
+ while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
+ ++cnt;
+ if (cnt != start)
+ {
+ int len = cnt - start;
+ codeset = xmalloc (len + 1);
+ *((char *) mempcpy (codeset, &this_line[start], len))
+ = '\0';
+ }
+ }
+ }
else if (strncmp (&this_line[1], "set", 3) == 0)
{
int cnt = sizeof ("set");
@@ -470,12 +509,44 @@ this is the first definition"));
}
else if (strncmp (&this_line[1], "quote", 5) == 0)
{
- int cnt = sizeof ("quote");
+ char buf[2];
+ char *bufptr;
+ size_t buflen;
+ char *wbufptr;
+ size_t wbuflen;
+ int cnt;
+
+ cnt = sizeof ("quote");
while (isspace (this_line[cnt]))
++cnt;
+
+ /* We need the conversion. */
+ if (cd_towc == (iconv_t) -1
+ && open_conversion (codeset, &cd_towc, &cd_tomb) != 0)
+ /* Something is wrong. */
+ goto out;
+
/* Yes, the quote char can be '\0'; this means no quote
- char. */
- current->quote_char = this_line[cnt];
+ char. The function using the information works on
+ wide characters so we have to convert it here. */
+ buf[0] = this_line[cnt];
+ buf[1] = '\0';
+ bufptr = buf;
+ buflen = 2;
+
+ wbufptr = (char *) wbuf;
+ wbuflen = wbufsize;
+
+ /* Flush the state. */
+ iconv (cd_towc, NULL, NULL, NULL, NULL);
+
+ iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen);
+ if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2])
+ error_at_line (0, 0, fname, start_line,
+ gettext ("invalid quote character"));
+ else
+ /* Use the converted wide character. */
+ current->quote_char = wbuf[0];
}
else
{
@@ -568,15 +639,92 @@ duplicated message identifier"));
if (message_number != 0)
{
+ char *inbuf;
+ size_t inlen;
+ char *outbuf;
+ size_t outlen;
struct message_list *newp;
+ size_t this_line_len = strlen (this_line) + 1;
+
+ /* We need the conversion. */
+ if (cd_towc == (iconv_t) -1
+ && open_conversion (codeset, &cd_towc, &cd_tomb) != 0)
+ /* Something is wrong. */
+ goto out;
+
+ /* Convert to a wide character string. We have to
+ interpret escape sequences which will be impossible
+ without doing the conversion if the codeset of the
+ message is stateful. */
+ while (1)
+ {
+ inbuf = this_line;
+ inlen = this_line_len;
+ outbuf = (char *) wbuf;
+ outlen = wbufsize;
+
+ /* Flush the state. */
+ iconv (cd_towc, NULL, NULL, NULL, NULL);
+
+ iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen);
+ if (inlen == 0)
+ {
+ /* The string is converted. */
+ assert (outlen < wbufsize);
+ assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1]
+ == L'\0');
+ break;
+ }
+
+ if (outlen != 0)
+ {
+ /* Something is wrong with this string, we ignore it. */
+ error_at_line (0, 0, fname, start_line, gettext ("\
+invalid character: message ignored"));
+ goto ignore;
+ }
+
+ /* The output buffer is too small. */
+ wbufsize *= 2;
+ wbuf = (wchar_t *) xrealloc (wbuf, wbufsize);
+ }
used = 1; /* Yes, we use the line. */
/* Strip quote characters, change escape sequences into
correct characters etc. */
- normalize_line (fname, start_line, this_line,
+ normalize_line (fname, start_line, cd_towc, wbuf,
current->quote_char);
+ /* Now the string is free of escape sequences. Convert it
+ back into a multibyte character string. First free the
+ memory allocated for the original string. */
+ obstack_free (&current->mem_pool, this_line);
+
+ /* Now fill in the new string. It should never happen that
+ the replaced string is longer than the original. */
+ inbuf = (char *) wbuf;
+ inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t);
+
+ outlen = obstack_room (&current->mem_pool);
+ start_line = (char *) obstack_alloc (&current->mem_pool, outlen);
+ outbuf = start_line;
+
+ /* Flush the state. */
+ iconv (cd_tomb, NULL, NULL, NULL, NULL);
+
+ iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen);
+ if (inlen != 0)
+ {
+ error_at_line (0, 0, fname, start_line,
+ gettext ("invalid line"));
+ goto ignore;
+ }
+ assert (outbuf[-1] == '\0');
+
+ /* Free the memory in the obstack we don't use. */
+ obstack_free (&current->mem_pool, outbuf);
+
newp = (struct message_list *) xmalloc (sizeof (*newp));
newp->number = message_number;
newp->message = this_line;
@@ -625,11 +773,20 @@ duplicated message identifier"));
gettext ("malformed line ignored"));
}
+ ignore:
/* We can save the memory for the line if it was not used. */
if (!used)
obstack_free (&current->mem_pool, this_line);
}
+ /* Close the conversion modules. */
+ iconv_close (cd_towc);
+ iconv_close (cd_tomb);
+ free (codeset);
+
+ out:
+ free (wbuf);
+
if (fp != stdin)
fclose (fp);
return current;
@@ -895,13 +1052,14 @@ find_set (struct catalog *current, int number)
/* Normalize given string *in*place* by processing escape sequences
and quote characters. */
static void
-normalize_line (const char *fname, size_t line, char *string, char quote_char)
+normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string,
+ wchar_t quote_char)
{
int is_quoted;
- char *rp = string;
- char *wp = string;
+ wchar_t *rp = string;
+ wchar_t *wp = string;
- if (quote_char != '\0' && *rp == quote_char)
+ if (quote_char != L'\0' && *rp == quote_char)
{
is_quoted = 1;
++rp;
@@ -909,58 +1067,83 @@ normalize_line (const char *fname, size_t line, char *string, char quote_char)
else
is_quoted = 0;
- while (*rp != '\0')
+ while (*rp != L'\0')
if (*rp == quote_char)
/* We simply end the string when we find the first time an
not-escaped quote character. */
break;
- else if (*rp == '\\')
+ else if (*rp == L'\\')
{
++rp;
- if (quote_char != '\0' && *rp == quote_char)
+ if (quote_char != L'\0' && *rp == quote_char)
/* This is an extension to XPG. */
*wp++ = *rp++;
else
/* Recognize escape sequences. */
switch (*rp)
{
- case 'n':
- *wp++ = '\n';
+ case L'n':
+ *wp++ = L'\n';
++rp;
break;
- case 't':
- *wp++ = '\t';
+ case L't':
+ *wp++ = L'\t';
++rp;
break;
- case 'v':
- *wp++ = '\v';
+ case L'v':
+ *wp++ = L'\v';
++rp;
break;
- case 'b':
- *wp++ = '\b';
+ case L'b':
+ *wp++ = L'\b';
++rp;
break;
- case 'r':
- *wp++ = '\r';
+ case L'r':
+ *wp++ = L'\r';
++rp;
break;
- case 'f':
- *wp++ = '\f';
+ case L'f':
+ *wp++ = L'\f';
++rp;
break;
- case '\\':
- *wp++ = '\\';
+ case L'\\':
+ *wp++ = L'\\';
++rp;
break;
- case '0' ... '7':
+ case L'0' ... L'7':
{
- int number = *rp++ - '0';
- while (number <= (255 / 8) && *rp >= '0' && *rp <= '7')
+ int number;
+ char cbuf[2];
+ char *cbufptr;
+ size_t cbufin;
+ wchar_t wcbuf[2];
+ char *wcbufptr;
+ size_t wcbufin;
+
+ number = *rp++ - L'0';
+ while (number <= (255 / 8) && *rp >= L'0' && *rp <= L'7')
{
number *= 8;
- number += *rp++ - '0';
+ number += *rp++ - L'0';
}
- *wp++ = (char) number;
+
+ cbuf[0] = (char) number;
+ cbuf[1] = '\0';
+ cbufptr = cbuf;
+ cbufin = 2;
+
+ wcbufptr = (char *) wcbuf;
+ wcbufin = sizeof (wcbuf);
+
+ /* Flush the state. */
+ iconv (cd, NULL, NULL, NULL, NULL);
+
+ iconv (cd, &cbufptr, &cbufin, &wcbufptr, &wcbufin);
+ if (cbufptr != &cbuf[2] || (wchar_t *) wcbufptr != &wcbuf[2])
+ error_at_line (0, 0, fname, line,
+ gettext ("invalid escape sequence"));
+ else
+ *wp++ = wcbuf[0];
}
break;
default:
@@ -974,10 +1157,10 @@ normalize_line (const char *fname, size_t line, char *string, char quote_char)
/* If we saw a quote character at the beginning we expect another
one at the end. */
if (is_quoted && *rp != quote_char)
- error (0, 0, fname, line, gettext ("unterminated message"));
+ error_at_line (0, 0, fname, line, gettext ("unterminated message"));
/* Terminate string. */
- *wp = '\0';
+ *wp = L'\0';
return;
}
@@ -1069,3 +1252,30 @@ read_old (struct catalog *catalog, const char *file_name)
}
}
}
+
+
+static int
+open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp)
+{
+ /* If the input file does not specify the codeset use the locale's. */
+ if (codeset == NULL)
+ {
+ setlocale (LC_ALL, "");
+ codeset = nl_langinfo (CODESET);
+ setlocale (LC_ALL, "C");
+ }
+
+ /* Get the conversion modules. */
+ *cd_towcp = iconv_open ("WCHAR_T", codeset);
+ *cd_tombp = iconv_open (codeset, "WCHAR_T");
+ if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1)
+ {
+ error (0, 0, gettext ("conversion modules not available"));
+ if (*cd_towcp != (iconv_t) -1)
+ iconv_close (*cd_towcp);
+
+ return 1;
+ }
+
+ return 0;
+}