aboutsummaryrefslogtreecommitdiff
path: root/string/strxfrm.c
diff options
context:
space:
mode:
Diffstat (limited to 'string/strxfrm.c')
-rw-r--r--string/strxfrm.c525
1 files changed, 320 insertions, 205 deletions
diff --git a/string/strxfrm.c b/string/strxfrm.c
index 2a3a8a9032..344e65b957 100644
--- a/string/strxfrm.c
+++ b/string/strxfrm.c
@@ -17,282 +17,397 @@
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
+#include <langinfo.h>
#include <stddef.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
-#ifndef WIDE_VERSION
-# define STRING_TYPE char
-# define USTRING_TYPE unsigned char
-# define L_(Ch) Ch
-# ifdef USE_IN_EXTENDED_LOCALE_MODEL
-# define STRXFRM __strxfrm_l
-# else
-# define STRXFRM strxfrm
-# endif
-# define STRLEN strlen
-# define STPNCPY __stpncpy
-#endif
+#include "../locale/localeinfo.h"
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
+#ifdef USE_IN_EXTENDED_LOCALE_MODEL
+# define STRXFRM __strxfrm_l
#else
-size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+# define STRXFRM strxfrm
#endif
-{
- if (n != 0)
- STPNCPY (dest, src, n);
- return STRLEN (src);
-}
-
-#if 0
-/* Include the shared helper functions. `strxfrm'/`wcsxfrm' also use
- these functions. */
-#include "../locale/weight.h"
-
-#ifndef WIDE_VERSION
-/* Write 32 bit value UTF-8 encoded but only if enough space is left. */
-static __inline size_t
-print_val (u_int32_t value, char *dest, size_t max, size_t act)
+/* These are definitions used by some of the functions for handling
+ UTF-8 encoding below. */
+static const uint32_t encoding_mask[] =
{
- char tmp[6];
- int idx = 0;
+ ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff
+};
- if (value < 0x80)
- tmp[idx++] = (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x20)
- tmp[idx++] = '\xc0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x10)
- tmp[idx++] = '\xe0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x08)
- tmp[idx++] = '\xf0' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- value >>= 6;
-
- if (value < 0x04)
- tmp[idx++] = '\xf8' + (char) value;
- else
- {
- tmp[idx++] = '\x80' + (char) (value & 0x3f);
- tmp[idx++] = '\xfc' + (char) (value >> 6);
- }
- }
- }
- }
- }
+static const unsigned char encoding_byte[] =
+{
+ 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
- while (idx-- > 0)
- {
- if (act < max)
- dest[act] = tmp[idx];
- ++act;
- }
- return act;
-}
-#else
-static __inline size_t
-print_val (u_int32_t value, wchar_t *dest, size_t max, size_t act)
+/* We need UTF-8 encoding of numbers. */
+static inline int
+utf8_encode (char *buf, int val)
{
- /* We cannot really assume wchar_t is 32 bits wide. But it is for
- GCC and so we don't do much optimization for the other case. */
- if (sizeof (wchar_t) == 4)
+ char *startp = buf;
+ int retval;
+
+ if (val < 0x80)
{
- if (act < max)
- dest[act] = (wchar_t) value;
- ++act;
+ *buf++ = (char) val;
+ retval = 1;
}
else
{
- wchar_t tmp[3];
- size_t idx = 0;
+ int step;
- if (value < 0x8000)
- tmp[idx++] = (wchar_t) act;
- else
- {
- tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
- value >>= 14;
- if (value < 0x2000)
- tmp[idx++] = (wchar_t) (0xc000 + value);
- else
- {
- tmp[idx++] = (wchar_t) (0x8000 + (value & 0x3fff));
- value >>= 14;
- tmp[idx++] = (wchar_t) (0xe000 + value);
- }
- }
- while (idx-- > 0)
+ for (step = 2; step < 6; ++step)
+ if ((val & encoding_mask[step - 2]) == 0)
+ break;
+ retval = step;
+
+ *buf = encoding_byte[step - 2];
+ --step;
+ do
{
- if (act < max)
- dest[act] = tmp[idx];
- ++act;
+ buf[step] = 0x80 | (val & 0x3f);
+ val >>= 6;
}
+ while (--step > 0);
+ *buf |= val;
}
- return act;
+
+ return buf - startp;
}
-#endif
-/* Transform SRC into a form such that the result of strcmp
- on two strings that have been transformed by strxfrm is
- the same as the result of strcoll on the two strings before
- their transformation. The transformed string is put in at
- most N characters of DEST and its length is returned. */
#ifndef USE_IN_EXTENDED_LOCALE_MODEL
size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n)
+STRXFRM (char *dest, const char *src, size_t n)
#else
size_t
-STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l)
+STRXFRM (char *dest, const char *src, size_t n, __locale_t l)
#endif
{
#ifdef USE_IN_EXTENDED_LOCALE_MODEL
struct locale_data *current = l->__locales[LC_COLLATE];
-# if BYTE_ORDER == BIG_ENDIAN
- const u_int32_t *collate_table = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].string;
- const u_int32_t *collate_extra = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].string;
-# elif BYTE_ORDER == LITTLE_ENDIAN
- const u_int32_t *collate_table = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].string;
- const u_int32_t *collate_extra = (const u_int32_t *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].string;
-# else
-# error bizarre byte order
-# endif
+ uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
+#else
+ uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
#endif
- weight_t *forw = NULL;
- weight_t *backw = NULL;
- size_t pass;
- size_t written;
-
- /* If the current locale does not specify locale data we use normal
- 8-bit string comparison. */
- if (collate_nrules == 0)
+ /* We don't assign the following values right away since it might be
+ unnecessary in case there are no rules. */
+ const unsigned char *rulesets;
+ const int32_t *table;
+ const unsigned char *weights;
+ const unsigned char *extra;
+ const int32_t *indirect;
+ uint_fast32_t pass;
+ size_t needed;
+ const unsigned char *usrc;
+ size_t srclen = strlen (src);
+ int32_t *idxarr;
+ unsigned char *rulearr;
+ size_t idxmax;
+ size_t idxcnt;
+ int use_malloc = 0;
+
+#include "../locale/weight.h"
+
+ if (nrules == 0)
{
if (n != 0)
- STPNCPY (dest, src, n);
+ __stpncpy (dest, src, n);
- return STRLEN (src);
+ return srclen;
}
+#ifdef USE_IN_EXTENDED_LOCALE_MODEL
+ rulesets = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+ table = (const int32_t *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_TABLEMB)].string;
+ weights = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB)].string;
+ extra = (const unsigned char *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB)].string;
+ indirect = (const int32_t *)
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB)].string;
+#else
+ rulesets = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
+ table = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
+ weights = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
+ extra = (const unsigned char *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
+ indirect = (const int32_t *)
+ _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
+#endif
+
/* Handle an empty string as a special case. */
- if (*src == '\0')
+ if (srclen == 0)
{
if (n != 0)
- *dest = '\0';
+ *dest = '\0';
return 1;
}
- /* Get full information about the string. This means we get
- information for all passes in a special data structure. */
- get_string (src, forw, backw);
+ /* We need the elements of the string as unsigned values since they
+ are used as indeces. */
+ usrc = (const unsigned char *) src;
+
+ /* Perform the first pass over the string and while doing this find
+ and store the weights for each character. Since we want this to
+ be as fast as possible we are using `alloca' to store the temporary
+ values. But since there is no limit on the length of the string
+ we have to use `malloc' if the string is too long. We should be
+ very conservative here. */
+ if (srclen >= 16384)
+ {
+ idxarr = (int32_t *) malloc (srclen * (sizeof (int32_t) + 1));
+ rulearr = (unsigned char *) &idxarr[srclen];
+
+ if (idxarr == NULL)
+ /* No memory. Well, go with the stack then.
+
+ XXX Once this implementation is stable we will handle this
+ differently. Instead of precomputing the indeces we will
+ do this in time. This means, though, that this happens for
+ every pass again. */
+ goto try_stack;
+ use_malloc = 1;
+ }
+ else
+ {
+ try_stack:
+ idxarr = (int32_t *) alloca (srclen * sizeof (int32_t));
+ rulearr = (unsigned char *) alloca (srclen);
+ }
- /* Now we have all the information. In at most the given number of
- passes we can finally decide about the order. */
- written = 0;
- for (pass = 0; pass < collate_nrules; ++pass)
+ idxmax = 0;
+ do
{
- int forward = (collate_rules[pass] & sort_forward) != 0;
- const weight_t *run = forward ? forw : backw;
- int idx = forward ? 0 : run->data[pass].number - 1;
+ int32_t tmp = findidx (&usrc);
+ rulearr[idxmax] = tmp >> 24;
+ idxarr[idxmax] = tmp & 0x80ffffff;
- while (1)
+ ++idxmax;
+ }
+ while (*usrc != '\0');
+
+ /* Now the passes over the weights. We now use the indeces we found
+ before. */
+ needed = 0;
+ for (pass = 0; pass < nrules; ++pass)
+ {
+ size_t backw_stop = ~0ul;
+ int rule = rulesets[rulearr[0] * nrules + pass];
+ /* We assume that if a rule has defined `position' in one section
+ this is true for all of them. */
+ int position = rule & sort_position;
+
+ if (position == 0)
{
- int ignore = 0;
- u_int32_t w = 0;
-
- /* Here we have to check for IGNORE entries. If these are
- found we count them and go on with he next value. */
- while (run != NULL
- && ((w = run->data[pass].value[idx])
- == (u_int32_t) IGNORE_CHAR))
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
- ++ignore;
- if (forward
- ? ++idx >= run->data[pass].number
- : --idx < 0)
+ if ((rule & sort_forward) != 0)
{
- weight_t *nextp = forward ? run->next : run->prev;
- if (nextp == NULL)
+ size_t len;
+
+ if (backw_stop != ~0ul)
{
- w = 0;
- /* No more non-INGOREd elements means lowest
- possible value. */
- ignore = -1;
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ len = weights[idxarr[backw]++];
+
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+
+ backw_stop = ~0ul;
}
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[idxcnt]++];
else
- idx = forward ? 0 : nextp->data[pass].number - 1;
- run = nextp;
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[idxcnt] += len;
+ }
+ }
+ else
+ {
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
}
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
- /* Stop if all characters are processed. */
- if (run == NULL)
- break;
- /* Now we have information of the number of ignored weights
- and the value of the next weight. We have to add 2
- because 0 means EOS and 1 is the intermediate string end. */
- if ((collate_rules[pass] & sort_position) != 0)
- written = print_val (ignore + 2, dest, n, written);
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
- if (w != 0)
- written = print_val (w, dest, n, written);
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ size_t len = weights[idxarr[backw]++];
- /* We have to increment the index counters. */
- if (forward)
+ if (needed + len < n)
+ while (len-- > 0)
+ dest[needed++] = weights[idxarr[backw]++];
+ else
+ {
+ /* No more characters fit into the buffer. */
+ needed += len;
+ idxarr[backw] += len;
+ }
+ }
+ }
+ }
+ else
+ {
+ int val = 1;
+ char buf[7];
+ size_t buflen;
+ size_t i;
+
+ for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
{
- if (++idx >= run->data[pass].number)
+ if ((rule & sort_forward) != 0)
+ {
+ size_t len;
+
+ if (backw_stop != ~0ul)
+ {
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxcnt - 1; backw >= backw_stop; --backw)
+ {
+ len = weights[idxarr[backw]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ idxarr[backw] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ ++val;
+ }
+
+ backw_stop = ~0ul;
+ }
+
+ /* Now handle the forward element. */
+ len = weights[idxarr[idxcnt]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[idxcnt] + i];
+ }
+ idxarr[idxcnt] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ /* Note that we don't have to increment `idxarr[idxcnt]'
+ since the length is zero. */
+ ++val;
+ }
+ else
{
- run = run->next;
- idx = 0;
+ /* Remember where the backwards series started. */
+ if (backw_stop == ~0ul)
+ backw_stop = idxcnt;
}
+
+ rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
}
- else
+
+ if (backw_stop != ~0)
{
- if (--idx < 0)
+ /* Handle the pushed elements now. */
+ size_t backw;
+
+ for (backw = idxmax - 1; backw >= backw_stop; --backw)
{
- run = run->prev;
- if (run != NULL)
- idx = run->data[pass].number - 1;
+ size_t len = weights[idxarr[backw]++];
+ if (len != 0)
+ {
+ buflen = utf8_encode (buf, val);
+ if (needed + buflen + len < n)
+ {
+ for (i = 0; i < buflen; ++i)
+ dest[needed + i] = buf[i];
+ for (i = 0; i < len; ++i)
+ dest[needed + buflen + i] =
+ weights[idxarr[backw] + i];
+ }
+ idxarr[backw] += len;
+ needed += buflen + len;
+ val = 1;
+ }
+ else
+ ++val;
}
}
}
- /* Write marker for end of word. */
- if (pass + 1 < collate_nrules)
- written = print_val (1, dest, n, written);
+ /* Finally store the byte to separate the passes or terminate
+ the string. */
+ if (needed < n)
+ dest[needed] = pass + 1 < nrules ? '\1' : '\0';
+ ++needed;
+ }
+
+ /* This is a little optimization: many collation specifications have
+ a `position' rule at the end and if no non-ignored character
+ is found the last \1 byte is immediately followed by a \0 byte
+ signalling this. We can avoid the \1 byte(s). */
+ if (needed > 2 && dest[needed - 2] == '\1')
+ {
+ /* Remove the \1 byte. */
+ --needed;
+ dest[needed - 1] = '\0';
}
- /* Terminate string. */
- if (written < n)
- dest[written] = L_('\0');
+ /* Free the memory if needed. */
+ if (use_malloc)
+ free (idxarr);
- /* Return length without counting the terminating '\0'. */
- return written;
+ return needed;
}
-#endif