/* Simple transformations functions. Copyright (C) 1997, 1998 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1997. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #include #ifndef EILSEQ # define EILSEQ EINVAL #endif /* These are definitions used by some of the functions for handling UTF-8 encoding below. */ static const wchar_t encoding_mask[] = { ~0x7ff, ~0xffff, ~0x1fffff, ~0x3ffffff }; static const unsigned char encoding_byte[] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; int __gconv_transform_dummy (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { size_t do_write; /* We have no stateful encoding. So we don't have to do anything special. */ if (do_flush) do_write = 0; else { do_write = MIN (*inlen, data->outbufsize - data->outbufavail); memcpy (data->outbuf, inbuf, do_write); *inlen -= do_write; data->outbufavail += do_write; } /* ### TODO Actually, this number must be devided according to the size of the input charset. I.e., if the input is in UCS4 the number of copied bytes must be divided by 4. */ if (written != NULL) *written = do_write; return GCONV_OK; } /* Transform from the internal, UCS4-like format, to UCS4. The difference between the internal ucs4 format and the real UCS4 format is, if any, the endianess. The Unicode/ISO 10646 says that unless some higher protocol specifies it differently, the byte order is big endian.*/ int __gconv_transform_internal_ucs4 (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write = 0; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); /* Clear output buffer. */ data->outbufavail = 0; } } else { int save_errno = errno; result = GCONV_OK; do { size_t n_convert = (MIN (*inlen, (data->outbufsize - data->outbufavail)) / sizeof (wchar_t)); #if __BYTE_ORDER == __LITTLE_ENDIAN /* Sigh, we have to do some real work. */ wchar_t *outbuf = (wchar_t *) &data->outbuf[data->outbufavail]; size_t cnt; for (cnt = 0; cnt < n_convert; ++cnt) outbuf[cnt] = bswap_32 (((wchar_t *) inbuf)[cnt]); #elif __BYTE_ORDER == __BIG_ENDIAN /* Simply copy the data. */ memcpy (&data->outbuf[data->outbufsize], inbuf, n_convert * sizeof (wchar_t)); #else # error "This endianess is not supported." #endif *inlen -= n_convert * sizeof (wchar_t); inbuf += n_convert * sizeof (wchar_t); data->outbufavail += n_convert * sizeof (wchar_t); do_write += n_convert; if (*inlen > 0 && *inlen < sizeof (wchar_t)) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = (*inlen < sizeof (wchar_t) ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT); break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen >= sizeof (wchar_t) && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write; return result; } /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */ int __gconv_transform_ascii_internal (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write = 0; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); /* Clear output buffer. */ data->outbufavail = 0; } } else { const unsigned char *newinbuf = inbuf; int save_errno = errno; result = GCONV_OK; do { size_t actually = 0; size_t cnt = 0; while (data->outbufavail + sizeof (wchar_t) <= data->outbufsize && cnt < *inlen) { if (*newinbuf > '\x7f') { /* This is no correct ANSI_X3.4-1968 character. */ result = GCONV_ILLEGAL_INPUT; break; } /* It's an one byte sequence. */ *(wchar_t *) &data->outbuf[data->outbufavail] = (wchar_t) *newinbuf; data->outbufavail += sizeof (wchar_t); ++actually; ++newinbuf; ++cnt; } /* Remember how much we converted. */ do_write += cnt * sizeof (wchar_t); *inlen -= cnt; /* Check whether an illegal character appeared. */ if (result != GCONV_OK) break; if (data->is_last) { /* This is the last step. */ result = (*inlen == 0 ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT); break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write / sizeof (wchar_t); return result; } /* Convert from ISO 10646/UCS to ISO 646-IRV. */ int __gconv_transform_internal_ascii (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); do_write = 0; /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); /* Clear output buffer. */ data->outbufavail = 0; } } else { const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; result = GCONV_OK; do { size_t actually = 0; size_t cnt = 0; while (data->outbufavail < data->outbufsize && cnt + 3 < *inlen) { if (*newinbuf < L'\0' || *newinbuf > L'\x7f') { /* This is no correct ANSI_X3.4-1968 character. */ result = GCONV_ILLEGAL_INPUT; break; } /* It's an one byte sequence. */ data->outbuf[data->outbufavail++] = (char) *newinbuf; ++actually; ++newinbuf; cnt += sizeof (wchar_t); } /* Remember how much we converted. */ do_write += cnt / sizeof (wchar_t); *inlen -= cnt; /* Check whether an illegal character appeared. */ if (result != GCONV_OK) break; /* Check for incomplete input. */ if (*inlen > 0 && *inlen < sizeof (wchar_t)) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = *inlen == 0 ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT; break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write; return result; } int __gconv_transform_internal_utf8 (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); do_write = 0; /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); /* Clear output buffer. */ data->outbufavail = 0; } } else { const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; result = GCONV_OK; do { size_t cnt = 0; while (data->outbufavail < data->outbufsize && cnt * sizeof (wchar_t) + 3 < *inlen) { wchar_t wc = newinbuf[cnt]; if (wc < 0 && wc > 0x7fffffff) { /* This is no correct ISO 10646 character. */ result = GCONV_ILLEGAL_INPUT; break; } if (wc < 0x80) /* It's an one byte sequence. */ data->outbuf[data->outbufavail++] = (char) wc; else { size_t step; size_t start; for (step = 2; step < 6; ++step) if ((wc & encoding_mask[step - 2]) == 0) break; if (data->outbufavail + step >= data->outbufsize) /* Too long. */ break; start = data->outbufavail; data->outbufavail += step; data->outbuf[start] = encoding_byte[step - 2]; --step; do { data->outbuf[start + step] = 0x80 | (wc & 0x3f); wc >>= 6; } while (--step > 0); data->outbuf[start] |= wc; } ++cnt; } /* Remember how much we converted. */ do_write += cnt; *inlen -= cnt * sizeof (wchar_t); newinbuf += cnt; /* Check whether an illegal character appeared. */ if (result != GCONV_OK) break; /* Check for incomplete input. */ if (*inlen > 0 && *inlen < sizeof (wchar_t)) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = *inlen == 0 ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT; break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write; return result; } int __gconv_transform_utf8_internal (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); do_write = 0; /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); } } else { int save_errno = errno; int extra = 0; do_write = 0; result = GCONV_OK; do { wchar_t *outbuf = (wchar_t *) &data->outbuf[data->outbufavail]; size_t cnt = 0; size_t actually = 0; while (data->outbufavail + sizeof (wchar_t) <= data->outbufsize && cnt < *inlen) { size_t start = cnt; wchar_t value; unsigned char byte; int count; /* Next input byte. */ byte = inbuf[cnt++]; if (byte < 0x80) { /* One byte sequence. */ count = 0; value = byte; } else if ((byte & 0xe0) == 0xc0) { count = 1; value = byte & 0x1f; } else if ((byte & 0xf0) == 0xe0) { /* We expect three bytes. */ count = 2; value = byte & 0x0f; } else if ((byte & 0xf8) == 0xf0) { /* We expect four bytes. */ count = 3; value = byte & 0x07; } else if ((byte & 0xfc) == 0xf8) { /* We expect five bytes. */ count = 4; value = byte & 0x03; } else if ((byte & 0xfe) == 0xfc) { /* We expect six bytes. */ count = 5; value = byte & 0x01; } else { /* This is an illegal encoding. */ result = GCONV_ILLEGAL_INPUT; break; } if (cnt + count > *inlen) { /* We don't have enough input. */ --cnt; extra = count; break; } /* Read the possible remaining bytes. */ while (count > 0) { byte = inbuf[cnt++]; --count; if ((byte & 0xc0) != 0x80) { /* This is an illegal encoding. */ result = GCONV_ILLEGAL_INPUT; break; } value <<= 6; value |= byte & 0x3f; } if (result != GCONV_OK) { cnt = start; break; } *outbuf++ = value; ++actually; } /* Remember how much we converted. */ do_write += actually; *inlen -= cnt; inbuf += cnt; data->outbufavail += actually * sizeof (wchar_t); /* Check whether an illegal character appeared. */ if (result != GCONV_OK) { result = GCONV_ILLEGAL_INPUT; break; } if (*inlen > 0 && *inlen < extra) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = (data->outbufavail + sizeof (wchar_t) > data->outbufsize ? GCONV_FULL_OUTPUT : GCONV_EMPTY_INPUT); break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write; return result; } int __gconv_transform_ucs2_internal (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); do_write = 0; /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); } } else { const uint16_t *newinbuf = (const uint16_t *) inbuf; int save_errno = errno; do_write = 0; do { wchar_t *outbuf = (wchar_t *) &data->outbuf[data->outbufavail]; size_t actually = 0; errno = 0; while (data->outbufavail + 4 <= data->outbufsize && *inlen >= 2) { #if __BYTE_ORDER == __LITTLE_ENDIAN outbuf[actually++] = (wchar_t) bswap_16 (*newinbuf++); #else outbuf[actually++] = (wchar_t) *newinbuf++; #endif data->outbufavail += 4; *inlen -= 2; } /* Remember how much we converted. */ do_write += actually * sizeof (wchar_t); if (*inlen == 1) { /* We have an incomplete character at the end. */ result = GCONV_INCOMPLETE_INPUT; break; } /* Check whether an illegal character appeared. */ if (errno != 0) { result = GCONV_ILLEGAL_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = (data->outbufavail + sizeof (wchar_t) > data->outbufsize ? GCONV_FULL_OUTPUT : GCONV_EMPTY_INPUT); break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write; return result; } int __gconv_transform_internal_ucs2 (struct gconv_step *step, struct gconv_step_data *data, const char *inbuf, size_t *inlen, size_t *written, int do_flush) { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; gconv_fct fct = next_step->fct; size_t do_write; int result; /* If the function is called with no input this means we have to reset to the initial state. The possibly partly converted input is dropped. */ if (do_flush) { /* Clear the state. */ memset (data->statep, '\0', sizeof (mbstate_t)); do_write = 0; /* Call the steps down the chain if there are any. */ if (data->is_last) result = GCONV_OK; else { struct gconv_step *next_step = step + 1; struct gconv_step_data *next_data = data + 1; result = (*fct) (next_step, next_data, NULL, 0, written, 1); /* Clear output buffer. */ data->outbufavail = 0; } } else { const wchar_t *newinbuf = (const wchar_t *) inbuf; int save_errno = errno; do_write = 0; do { uint16_t *outbuf = (uint16_t *) &data->outbuf[data->outbufavail]; size_t actually = 0; errno = 0; while (data->outbufavail + 2 <= data->outbufsize && *inlen >= 4) { if (*newinbuf >= 0x10000) { __set_errno (EILSEQ); break; } #if __BYTE_ORDER == __LITTLE_ENDIAN /* Please note that we use the `uint32_t' pointer as a `uint16_t' pointer which works since we are on a little endian machine. */ outbuf[actually++] = bswap_16 (*((uint16_t *) newinbuf)); ++newinbuf; #else outbuf[actually++] = *newinbuf++; #endif *inlen -= 4; data->outbufavail += 2; } /* Remember how much we converted. */ do_write += (const char *) newinbuf - inbuf; if (*inlen > 0 && *inlen < 4) { /* We have an incomplete input character. */ result = GCONV_INCOMPLETE_INPUT; break; } /* Check whether an illegal character appeared. */ if (errno != 0) { result = GCONV_ILLEGAL_INPUT; break; } if (data->is_last) { /* This is the last step. */ result = *inlen == 0 ? GCONV_EMPTY_INPUT : GCONV_FULL_OUTPUT; break; } /* Status so far. */ result = GCONV_EMPTY_INPUT; if (data->outbufavail > 0) { /* Call the functions below in the chain. */ size_t newavail = data->outbufavail; result = (*fct) (next_step, next_data, data->outbuf, &newavail, written, 0); /* Correct the output buffer. */ if (newavail != data->outbufavail && newavail > 0) { memmove (data->outbuf, &data->outbuf[data->outbufavail - newavail], newavail); data->outbufavail = newavail; } } } while (*inlen > 0 && result == GCONV_EMPTY_INPUT); __set_errno (save_errno); } if (written != NULL && data->is_last) *written = do_write / sizeof (wchar_t); return result; }