aboutsummaryrefslogtreecommitdiff
path: root/locale/programs/ld-collate.c
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>1999-12-30 08:09:32 +0000
committerUlrich Drepper <drepper@redhat.com>1999-12-30 08:09:32 +0000
commit66ac0abe0341c5b3b1189c0ef9805ac931aecf6e (patch)
tree4871f21f421ea16d19ab5ee4ea6ca1ccec448996 /locale/programs/ld-collate.c
parentd876f5327985eac3bf3109e9429febc8a8954ff5 (diff)
downloadglibc-66ac0abe0341c5b3b1189c0ef9805ac931aecf6e.tar
glibc-66ac0abe0341c5b3b1189c0ef9805ac931aecf6e.tar.gz
glibc-66ac0abe0341c5b3b1189c0ef9805ac931aecf6e.tar.bz2
glibc-66ac0abe0341c5b3b1189c0ef9805ac931aecf6e.zip
Update.
1999-12-13 Andreas Jaeger <aj@suse.de> * resolv/resolv.h: Remove K&R compatibility. * resolv/res_libc.c: Move definition of _res after res_init, res_init should use the threaded specific context. * resolv/Makefile (+cflags): Remove -Wno-comment since it's not needed anymore. * locale/langinfo.h: Add constants for wide character collation data. * locale/categories.def: Add appropriate entries for collate entries. * locale/C-collate.c: Add initializers for new entries. * locale/programs/ld-collate.c: Implement output of wide character tables. * locale/programs/ld-ctype.c (allocate_arrays): Change algorithm to compute wide character table size a bit: it now gives up a bit of total table size for fewer levels.
Diffstat (limited to 'locale/programs/ld-collate.c')
-rw-r--r--locale/programs/ld-collate.c442
1 files changed, 434 insertions, 8 deletions
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index a7eb8083a4..ae132b4da5 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -106,6 +106,9 @@ struct element_t
/* Next element in multibyte output list. */
struct element_t *mbnext;
+
+ /* Next element in wide character output list. */
+ struct element_t *wcnext;
};
/* Special element value. */
@@ -171,6 +174,14 @@ struct locale_collate_t
/* Arrays with heads of the list for each of the leading bytes in
the multibyte sequences. */
struct element_t *mbheads[256];
+
+ /* Table size of wide character hash table. */
+ size_t plane_size;
+ size_t plane_cnt;
+
+ /* Arrays with heads of the list for each of the leading bytes in
+ the multibyte sequences. */
+ struct element_t **wcheads;
};
@@ -1381,6 +1392,9 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
int need_undefined = 0;
struct section_list *sect;
int ruleidx;
+ int nr_wide_elems = 0;
+ size_t min_total;
+ size_t act_size;
if (collate == NULL)
{
@@ -1457,8 +1471,8 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
be encoded to make it possible to emit the value as a byte
string. */
for (i = 0; i < nrules; ++i)
- mbact[i] = 3;
- wcact = 3;
+ mbact[i] = 2;
+ wcact = 2;
runp = collate->start;
while (runp != NULL)
{
@@ -1518,7 +1532,13 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
}
if (runp->wcs != NULL)
- runp->wcorder = wcact++;
+ {
+ runp->wcorder = wcact++;
+
+ /* We take the opportunity to count the elements which have
+ wide characters. */
+ ++nr_wide_elems;
+ }
/* Up to the next entry. */
runp = runp->next;
@@ -1533,6 +1553,165 @@ collate_finish (struct localedef_t *locale, struct charmap_t *charmap)
collate->mbheads[i] = &collate->undefined;
}
+ /* Now to the wide character case. Here we have to find first a good
+ mapping function to get the wide range of wide character values
+ (0x00000000 to 0x7fffffff) to a managable table. This might take
+ some time so we issue a warning.
+
+ We use a very trivial hashing function to store the sparse
+ table. CH % TABSIZE is used as an index. To solve multiple hits
+ we have N planes. This guarantees a fixed search time for a
+ character [N / 2]. In the following code we determine the minimum
+ value for TABSIZE * N, where TABSIZE >= 256.
+
+ Some people complained that this algorithm takes too long. Well,
+ go on, improve it. But changing the step size is *not* an
+ option. Some people changed this to use only sizes of prime
+ numbers. Think again, do some math. We are looking for the
+ optimal solution, not something which works in general. Unless
+ somebody can provide a dynamic programming solution I think this
+ implementation is as good as it can get. */
+ if (nr_wide_elems > 512 && !be_quiet)
+ fputs (_("\
+Computing table size for collation table might take a while..."),
+ stderr);
+
+ min_total = UINT_MAX;
+ act_size = 256;
+
+ /* While we want to have a small total size we are willing to use a
+ little bit larger table if this reduces the number of layers.
+ Therefore we add a little penalty to the number of planes.
+ Maybe this constant has to be adjusted a bit. */
+#define PENALTY 128
+ do
+ {
+ size_t cnt[act_size];
+ struct element_t *elem[act_size];
+ size_t act_planes = 1;
+
+ memset (cnt, '\0', sizeof cnt);
+ memset (elem, '\0', sizeof elem);
+
+ runp = collate->start;
+ while (runp != NULL)
+ {
+ if (runp->wcs != NULL)
+ {
+ size_t nr = runp->wcs[0] % act_size;
+ struct element_t *elemp = elem[nr];
+
+ while (elemp != NULL)
+ {
+ if (elemp->wcs[0] == runp->wcs[0])
+ break;
+ elemp = elemp->wcnext;
+ }
+
+ if (elemp == NULL && ++cnt[nr] > act_planes)
+ {
+ act_planes = cnt[nr];
+
+ runp->wcnext = elem[nr];
+ elem[nr] = runp;
+
+ if ((act_size + PENALTY) * act_planes >= min_total)
+ break;
+ }
+ }
+
+ /* Up to the next entry. */
+ runp = runp->next;
+ }
+
+ if ((act_size + PENALTY) * act_planes < min_total)
+ {
+ min_total = (act_size + PENALTY) * act_planes;
+ collate->plane_size = act_size;
+ collate->plane_cnt = act_planes;
+ }
+
+ ++act_size;
+ }
+ while (act_size < min_total);
+
+ if (nr_wide_elems > 512 && !be_quiet)
+ fputs (_(" done\n"), stderr);
+
+ /* Now that we know how large the table has to be we are able to
+ allocate the array and start adding the characters to the lists
+ in the same way we did it for the multibyte characters. */
+ collate->wcheads = (struct element_t **)
+ obstack_alloc (&collate->mempool, (collate->plane_size
+ * collate->plane_cnt
+ * sizeof (struct element_t *)));
+ memset (collate->wcheads, '\0', (collate->plane_size
+ * collate->plane_cnt
+ * sizeof (struct element_t *)));
+
+ /* Start adding. */
+ runp = collate->start;
+ while (runp != NULL)
+ {
+ if (runp->wcs != NULL)
+ {
+ struct element_t **eptr;
+ size_t idx;
+
+ /* Find a free index. */
+ idx = runp->wcs[0] % collate->plane_size;
+ while (collate->wcheads[idx] != NULL)
+ {
+ /* Stop if this is an entry with the same starting character. */
+ if (collate->wcheads[idx]->wcs[0] == runp->wcs[0])
+ break;
+
+ idx += collate->plane_size;
+ }
+
+ /* Find the point where to insert in the list. */
+ eptr = &collate->wcheads[idx];
+ while (*eptr != NULL)
+ {
+ if ((*eptr)->nwcs < runp->nwcs)
+ break;
+
+ if ((*eptr)->nwcs == runp->nwcs)
+ {
+ int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
+ (wchar_t *) runp->wcs, runp->nwcs);
+
+ if (c == 0)
+ {
+ /* This should not happen. It means that we have
+ to symbols with the same byte sequence. It is
+ of course an error. */
+ error_at_line (0, 0, (*eptr)->file, (*eptr)->line,
+ _("symbol `%s' has same encoding as"),
+ (*eptr)->name);
+ error_at_line (0, 0, runp->file, runp->line,
+ _("symbol `%s'"), runp->name);
+ goto dont_insertwc;
+ }
+ else if (c < 0)
+ /* Insert it here. */
+ break;
+ }
+
+ /* To the next entry. */
+ eptr = &(*eptr)->wcnext;
+ }
+
+ /* Set the pointers. */
+ runp->wcnext = *eptr;
+ *eptr = runp;
+ dont_insertwc:
+ }
+
+ /* Up to the next entry. */
+ runp = runp->next;
+ }
+
/* Now determine whether the UNDEFINED entry is needed and if yes,
whether it was defined. */
collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
@@ -1620,6 +1799,45 @@ output_weight (struct obstack *pool, struct locale_collate_t *collate,
}
+static int32_t
+output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
+ struct element_t *elem)
+{
+ size_t cnt;
+ int32_t retval;
+
+ /* Optimize the use of UNDEFINED. */
+ if (elem == &collate->undefined)
+ /* The weights are already inserted. */
+ return 0;
+
+ /* This byte can start exactly one collation element and this is
+ a single byte. We can directly give the index to the weights. */
+ retval = obstack_object_size (pool);
+
+ /* Construct the weight. */
+ for (cnt = 0; cnt < nrules; ++cnt)
+ {
+ int32_t buf[elem->weights[cnt].cnt];
+ int32_t i;
+
+ for (i = 0; i < elem->weights[cnt].cnt; ++i)
+ if (elem->weights[cnt].w[i] != NULL)
+ buf[i] = elem->weights[cnt].w[i]->wcorder;
+
+ /* And add the buffer content. */
+ if (sizeof (int) == sizeof (int32_t))
+ obstack_int_grow (pool, i);
+ else
+ obstack_grow (pool, &i, sizeof (int32_t));
+
+ obstack_grow (pool, buf, i * sizeof (int32_t));
+ }
+
+ return retval | ((elem->section->ruleidx & 0x7f) << 24);
+}
+
+
void
collate_output (struct localedef_t *locale, struct charmap_t *charmap,
const char *output_path)
@@ -1636,6 +1854,9 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
struct obstack extrapool;
struct obstack indirectpool;
struct section_list *sect;
+ uint32_t *names;
+ uint32_t *tablewc;
+ size_t table_size;
int i;
data.magic = LIMAGIC (LC_COLLATE);
@@ -1783,7 +2004,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
int i;
/* Now add first the initial byte sequence. */
- added = ((sizeof (int32_t) + 1 + 1 + 2 * (runp->nmbs - 1)
+ added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
+ __alignof__ (int32_t) - 1)
& ~(__alignof__ (int32_t) - 1));
obstack_make_room (&extrapool, added);
@@ -1809,7 +2030,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
while (1)
{
if (sizeof (int32_t) == sizeof (int))
- obstack_int_grow_fast (&extrapool, weightidx);
+ obstack_int_grow (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
@@ -1833,7 +2054,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
weightidx = output_weight (&weightpool, collate, runp);
if (sizeof (int32_t) == sizeof (int))
- obstack_int_grow_fast (&extrapool, weightidx);
+ obstack_int_grow (&extrapool, weightidx);
else
obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
}
@@ -1844,7 +2065,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
tested for). */
int i;
- added = ((sizeof (int32_t) + 1 + 1 + runp->nmbs - 1
+ added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
+ __alignof__ (int32_t) - 1)
& ~(__alignof__ (int32_t) - 1));
obstack_make_room (&extrapool, added);
@@ -1900,7 +2121,7 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
}
}
- /* Now add the three tables. */
+ /* Now add the four tables. */
assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
iov[2 + cnt].iov_base = tablemb;
iov[2 + cnt].iov_len = sizeof (tablemb);
@@ -1926,6 +2147,211 @@ collate_output (struct localedef_t *locale, struct charmap_t *charmap,
++cnt;
+ /* Now the same for the wide character table. We need to store some
+ more information here. */
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE));
+ iov[2 + cnt].iov_base = &collate->plane_size;
+ iov[2 + cnt].iov_len = sizeof (collate->plane_size);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS));
+ iov[2 + cnt].iov_base = &collate->plane_cnt;
+ iov[2 + cnt].iov_len = sizeof (collate->plane_cnt);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ /* Construct a table with the names. The size of the table is the same
+ as the table with the pointers. */
+ table_size = collate->plane_size * collate->plane_cnt;
+ names = (uint32_t *) alloca (table_size * sizeof (uint32_t));
+ for (ch = 0; ch < table_size; ++ch)
+ if (collate->wcheads[ch] == NULL)
+ names[ch] = 0;
+ else
+ names[ch] = collate->wcheads[ch]->wcs[0];
+
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NAMES));
+ iov[2 + cnt].iov_base = names;
+ iov[2 + cnt].iov_len = table_size * sizeof (uint32_t);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ /* Generate the table. Walk through the lists of sequences
+ starting with the same byte and add them one after the other to
+ the table. In case we have more than one sequence starting with
+ the same byte we have to use extra indirection. */
+ tablewc = (uint32_t *) alloca (table_size * sizeof (uint32_t));
+ for (ch = 0; ch < table_size; ++ch)
+ if (collate->wcheads[ch] == NULL)
+ {
+ /* Set the entry to zero. */
+ tablewc[ch] = 0;
+ }
+ else if (collate->wcheads[ch]->wcnext == NULL
+ && collate->wcheads[ch]->nwcs == 1)
+ {
+ tablewc[ch] = output_weightwc (&weightpool, collate,
+ collate->wcheads[ch]);
+ }
+ else
+ {
+ /* As for the singlebyte table, we recognize sequences and
+ compress them. */
+ struct element_t *runp = collate->wcheads[ch];
+ struct element_t *lastp;
+
+ tablewc[ch] = -obstack_object_size (&extrapool);
+
+ do
+ {
+ /* Store the current index in the weight table. We know that
+ the current position in the `extrapool' is aligned on a
+ 32-bit address. */
+ int32_t weightidx;
+ int added;
+
+ /* Output the weight info. */
+ weightidx = output_weightwc (&weightpool, collate, runp);
+
+ /* Find out wether this is a single entry or we have more than
+ one consecutive entry. */
+ if (runp->wcnext != NULL
+ && runp->nwcs == runp->wcnext->nwcs
+ && wmemcmp ((wchar_t *) runp->wcs,
+ (wchar_t *)runp->wcnext->wcs, runp->nwcs - 1) == 0
+ && (runp->wcs[runp->nwcs - 1] + 1
+ == runp->wcnext->wcs[runp->nwcs - 1]))
+ {
+ int i;
+
+ /* Now add first the initial byte sequence. */
+ added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_make_room (&extrapool, added);
+
+ /* More than one consecutive entry. We mark this by having
+ a negative index into the indirect table. */
+ if (sizeof (int32_t) == sizeof (int))
+ {
+ obstack_int_grow_fast (&extrapool,
+ obstack_object_size (&indirectpool)
+ / sizeof (int32_t));
+ obstack_int_grow_fast (&extrapool, runp->nwcs - 1);
+ }
+ else
+ {
+ int32_t i = (obstack_object_size (&indirectpool)
+ / sizeof (int32_t));
+ obstack_grow (&extrapool, &i, sizeof (int32_t));
+ i = runp->nwcs - 1;
+ obstack_grow (&extrapool, &i, sizeof (int32_t));
+ }
+ for (i = 1; i < runp->nwcs; ++i)
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow_fast (&extrapool, runp->wcs[i]);
+ else
+ obstack_grow (&extrapool, &runp->wcs[i], sizeof (int32_t));
+
+ /* Now find the end of the consecutive sequence and
+ add all the indeces in the indirect pool. */
+ while (1)
+ {
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow (&extrapool, weightidx);
+ else
+ obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
+
+ runp = runp->next;
+ if (runp->wcnext == NULL
+ || runp->nwcs != runp->wcnext->nwcs
+ || wmemcmp ((wchar_t *) runp->wcs,
+ (wchar_t *) runp->wcnext->wcs,
+ runp->nwcs - 1) != 0
+ || (runp->wcs[runp->nwcs - 1] + 1
+ != runp->wcnext->wcs[runp->nwcs - 1]))
+ break;
+
+ /* Insert the weight. */
+ weightidx = output_weightwc (&weightpool, collate, runp);
+ }
+
+ /* And add the end byte sequence. Without length this
+ time. */
+ for (i = 1; i < runp->nwcs; ++i)
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow (&extrapool, runp->wcs[i]);
+ else
+ obstack_grow (&extrapool, &runp->wcs[i], sizeof (int32_t));
+
+ weightidx = output_weightwc (&weightpool, collate, runp);
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow (&extrapool, weightidx);
+ else
+ obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
+ }
+ else
+ {
+ /* A single entry. Simply add the index and the length and
+ string (except for the first character which is already
+ tested for). */
+ int i;
+
+ added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
+ if (sizeof (int) == sizeof (int32_t))
+ obstack_make_room (&extrapool, added);
+
+ if (sizeof (int32_t) == sizeof (int))
+ {
+ obstack_int_grow_fast (&extrapool, weightidx);
+ obstack_int_grow_fast (&extrapool, runp->nwcs - 1);
+ }
+ else
+ {
+ int32_t l = runp->nwcs - 1;
+ obstack_grow (&extrapool, &weightidx, sizeof (int32_t));
+ obstack_grow (&extrapool, &l, sizeof (int32_t));
+ }
+ for (i = 1; i < runp->nwcs; ++i)
+ if (sizeof (int32_t) == sizeof (int))
+ obstack_int_grow_fast (&extrapool, runp->wcs[i]);
+ else
+ obstack_grow (&extrapool, &runp->wcs[i], sizeof (int32_t));
+ }
+
+ /* Next entry. */
+ lastp = runp;
+ runp = runp->wcnext;
+ }
+ while (runp != NULL);
+ }
+
+ /* Now add the four tables. */
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
+ iov[2 + cnt].iov_base = tablewc;
+ iov[2 + cnt].iov_len = table_size * sizeof (int32_t);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
+ iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
+ iov[2 + cnt].iov_base = obstack_finish (&weightpool);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
+ iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
+ iov[2 + cnt].iov_base = obstack_finish (&extrapool);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+ assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
+ iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
+ iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
+ idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
+ ++cnt;
+
+
assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);