diff options
Diffstat (limited to 'libidn/idna.c')
-rw-r--r-- | libidn/idna.c | 797 |
1 files changed, 797 insertions, 0 deletions
diff --git a/libidn/idna.c b/libidn/idna.c new file mode 100644 index 0000000000..69c928fc42 --- /dev/null +++ b/libidn/idna.c @@ -0,0 +1,797 @@ +/* idna.c Convert to or from IDN strings. + * Copyright (C) 2002, 2003, 2004 Simon Josefsson + * + * This file is part of GNU Libidn. + * + * GNU Libidn is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * GNU Libidn is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GNU Libidn; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stdlib.h> +#include <string.h> +#include <stringprep.h> +#include <punycode.h> + +#include "idna.h" + +#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \ + (c) == 0xFF0E || (c) == 0xFF61) + +/* Core functions */ + +/** + * idna_to_ascii_4i + * @in: input array with unicode code points. + * @inlen: length of input array with unicode code points. + * @out: output zero terminated string that must have room for at + * least 63 characters plus the terminating zero. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * The ToASCII operation takes a sequence of Unicode code points that make + * up one label and transforms it into a sequence of code points in the + * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the + * resulting sequence are equivalent labels. + * + * It is important to note that the ToASCII operation can fail. ToASCII + * fails if any step of it fails. If any step of the ToASCII operation + * fails on any label in a domain name, that domain name MUST NOT be used + * as an internationalized domain name. The method for deadling with this + * failure is application-specific. + * + * The inputs to ToASCII are a sequence of code points, the AllowUnassigned + * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a + * sequence of ASCII code points or a failure condition. + * + * ToASCII never alters a sequence of code points that are all in the ASCII + * range to begin with (although it could fail). Applying the ToASCII + * operation multiple times has exactly the same effect as applying it just + * once. + * + * Return value: Returns 0 on success, or an error code. + */ +int +idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags) +{ + size_t len, outlen; + uint32_t *src; /* XXX don't need to copy data? */ + int rc; + + /* + * ToASCII consists of the following steps: + * + * 1. If all code points in the sequence are in the ASCII range (0..7F) + * then skip to step 3. + */ + + { + size_t i; + int inasciirange; + + inasciirange = 1; + for (i = 0; i < inlen; i++) + if (in[i] > 0x7F) + inasciirange = 0; + if (inasciirange) + { + src = malloc (sizeof (in[0]) * (inlen + 1)); + if (src == NULL) + return IDNA_MALLOC_ERROR; + + memcpy (src, in, sizeof (in[0]) * inlen); + src[inlen] = 0; + + goto step3; + } + } + + /* + * 2. Perform the steps specified in [NAMEPREP] and fail if there is + * an error. The AllowUnassigned flag is used in [NAMEPREP]. + */ + + { + char *p; + + p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL); + if (p == NULL) + return IDNA_MALLOC_ERROR; + + len = strlen (p); + do + { + len = 2 * len + 10; /* XXX better guess? */ + p = realloc (p, len); + if (p == NULL) + return IDNA_MALLOC_ERROR; + + if (flags & IDNA_ALLOW_UNASSIGNED) + rc = stringprep_nameprep (p, len); + else + rc = stringprep_nameprep_no_unassigned (p, len); + } + while (rc == STRINGPREP_TOO_SMALL_BUFFER); + + if (rc != STRINGPREP_OK) + { + free (p); + return IDNA_STRINGPREP_ERROR; + } + + src = stringprep_utf8_to_ucs4 (p, -1, NULL); + + free (p); + } + +step3: + /* + * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks: + * + * (a) Verify the absence of non-LDH ASCII code points; that is, + * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F. + * + * (b) Verify the absence of leading and trailing hyphen-minus; + * that is, the absence of U+002D at the beginning and end of + * the sequence. + */ + + if (flags & IDNA_USE_STD3_ASCII_RULES) + { + size_t i; + + for (i = 0; src[i]; i++) + if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F || + (src[i] >= 0x3A && src[i] <= 0x40) || + (src[i] >= 0x5B && src[i] <= 0x60) || + (src[i] >= 0x7B && src[i] <= 0x7F)) + { + free (src); + return IDNA_CONTAINS_NON_LDH; + } + + if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D)) + { + free (src); + return IDNA_CONTAINS_MINUS; + } + } + + /* + * 4. If all code points in the sequence are in the ASCII range + * (0..7F), then skip to step 8. + */ + + { + size_t i; + int inasciirange; + + inasciirange = 1; + for (i = 0; src[i]; i++) + { + if (src[i] > 0x7F) + inasciirange = 0; + /* copy string to output buffer if we are about to skip to step8 */ + if (i < 64) + out[i] = src[i]; + } + if (i < 64) + out[i] = '\0'; + if (inasciirange) + goto step8; + } + + /* + * 5. Verify that the sequence does NOT begin with the ACE prefix. + * + */ + + { + size_t i; + int match; + + match = 1; + for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++) + if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i]) + match = 0; + if (match) + { + free (src); + return IDNA_CONTAINS_ACE_PREFIX; + } + } + + /* + * 6. Encode the sequence using the encoding algorithm in [PUNYCODE] + * and fail if there is an error. + */ + for (len = 0; src[len]; len++) + ; + src[len] = '\0'; + outlen = 63 - strlen (IDNA_ACE_PREFIX); + rc = punycode_encode (len, src, NULL, + &outlen, &out[strlen (IDNA_ACE_PREFIX)]); + if (rc != PUNYCODE_SUCCESS) + { + free (src); + return IDNA_PUNYCODE_ERROR; + } + out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0'; + + /* + * 7. Prepend the ACE prefix. + */ + + memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)); + + /* + * 8. Verify that the number of code points is in the range 1 to 63 + * inclusive (0 is excluded). + */ + +step8: + free (src); + if (strlen (out) < 1 || strlen (out) > 63) + return IDNA_INVALID_LENGTH; + + return IDNA_SUCCESS; +} + +/* ToUnicode(). May realloc() utf8in. */ +static int +idna_to_unicode_internal (char *utf8in, + uint32_t * out, size_t * outlen, int flags) +{ + int rc; + char tmpout[64]; + size_t utf8len = strlen (utf8in) + 1; + size_t addlen = 0; + + /* + * ToUnicode consists of the following steps: + * + * 1. If the sequence contains any code points outside the ASCII range + * (0..7F) then proceed to step 2, otherwise skip to step 3. + */ + + { + size_t i; + int inasciirange; + + inasciirange = 1; + for (i = 0; utf8in[i]; i++) + if (utf8in[i] & ~0x7F) + inasciirange = 0; + if (inasciirange) + goto step3; + } + + /* + * 2. Perform the steps specified in [NAMEPREP] and fail if there is an + * error. (If step 3 of ToASCII is also performed here, it will not + * affect the overall behavior of ToUnicode, but it is not + * necessary.) The AllowUnassigned flag is used in [NAMEPREP]. + */ + do + { + utf8in = realloc (utf8in, utf8len + addlen); + if (!utf8in) + return IDNA_MALLOC_ERROR; + if (flags & IDNA_ALLOW_UNASSIGNED) + rc = stringprep_nameprep (utf8in, utf8len + addlen); + else + rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen); + addlen += 1; + } + while (rc == STRINGPREP_TOO_SMALL_BUFFER); + + if (rc != STRINGPREP_OK) + return IDNA_STRINGPREP_ERROR; + + /* 3. Verify that the sequence begins with the ACE prefix, and save a + * copy of the sequence. + */ + +step3: + if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0) + return IDNA_NO_ACE_PREFIX; + + /* 4. Remove the ACE prefix. + */ + + memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)], + strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1); + + /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE] + * and fail if there is an error. Save a copy of the result of + * this step. + */ + + (*outlen)--; /* reserve one for the zero */ + + rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL); + if (rc != PUNYCODE_SUCCESS) + return IDNA_PUNYCODE_ERROR; + + out[*outlen] = 0; /* add zero */ + + /* 6. Apply ToASCII. + */ + + rc = idna_to_ascii_4i (out, *outlen, tmpout, flags); + if (rc != IDNA_SUCCESS) + return rc; + + /* 7. Verify that the result of step 6 matches the saved copy from + * step 3, using a case-insensitive ASCII comparison. + */ + + if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0) + return IDNA_ROUNDTRIP_VERIFY_ERROR; + + /* 8. Return the saved copy from step 5. + */ + + return IDNA_SUCCESS; +} + +/** + * idna_to_unicode_44i + * @in: input array with unicode code points. + * @inlen: length of input array with unicode code points. + * @out: output array with unicode code points. + * @outlen: on input, maximum size of output array with unicode code points, + * on exit, actual size of output array with unicode code points. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * The ToUnicode operation takes a sequence of Unicode code points + * that make up one label and returns a sequence of Unicode code + * points. If the input sequence is a label in ACE form, then the + * result is an equivalent internationalized label that is not in ACE + * form, otherwise the original sequence is returned unaltered. + * + * ToUnicode never fails. If any step fails, then the original input + * sequence is returned immediately in that step. + * + * The Punycode decoder can never output more code points than it + * inputs, but Nameprep can, and therefore ToUnicode can. Note that + * the number of octets needed to represent a sequence of code points + * depends on the particular character encoding used. + * + * The inputs to ToUnicode are a sequence of code points, the + * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of + * ToUnicode is always a sequence of Unicode code points. + * + * Return value: Returns error condition, but it must only be used for + * debugging purposes. The output buffer is always + * guaranteed to contain the correct data according to + * the specification (sans malloc induced errors). NB! + * This means that you normally ignore the return code + * from this function, as checking it means breaking the + * standard. + */ +int +idna_to_unicode_44i (const uint32_t * in, size_t inlen, + uint32_t * out, size_t * outlen, int flags) +{ + int rc; + size_t outlensave = *outlen; + char *p; + + p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL); + if (p == NULL) + return IDNA_MALLOC_ERROR; + + rc = idna_to_unicode_internal (p, out, outlen, flags); + if (rc != IDNA_SUCCESS) + { + memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ? + inlen : outlensave)); + *outlen = inlen; + } + + free (p); + + return rc; +} + +/* Wrappers that handle several labels */ + +/** + * idna_to_ascii_4z: + * @input: zero terminated input Unicode string. + * @output: pointer to newly allocated output string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert UCS-4 domain name to ASCII string. The domain name may + * contain several labels, separated by dots. The output buffer must + * be deallocated by the caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_ascii_4z (const uint32_t * input, char **output, int flags) +{ + const uint32_t *start = input; + const uint32_t *end = input; + char buf[64]; + char *out = NULL; + int rc; + + /* 1) Whenever dots are used as label separators, the following + characters MUST be recognized as dots: U+002E (full stop), + U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), + U+FF61 (halfwidth ideographic full stop). */ + + if (input[0] == 0) + { + /* Handle implicit zero-length root label. */ + *output = malloc (1); + if (!*output) + return IDNA_MALLOC_ERROR; + strcpy (*output, ""); + return IDNA_SUCCESS; + } + + if (DOTP (input[0]) && input[1] == 0) + { + /* Handle explicit zero-length root label. */ + *output = malloc (2); + if (!*output) + return IDNA_MALLOC_ERROR; + strcpy (*output, "."); + return IDNA_SUCCESS; + } + + *output = NULL; + do + { + end = start; + + for (; *end && !DOTP (*end); end++) + ; + + if (*end == '\0' && start == end) + { + /* Handle explicit zero-length root label. */ + buf[0] = '\0'; + } + else + { + rc = idna_to_ascii_4i (start, end - start, buf, flags); + if (rc != IDNA_SUCCESS) + return rc; + } + + if (out) + { + out = realloc (out, strlen (out) + 1 + strlen (buf) + 1); + if (!out) + return IDNA_MALLOC_ERROR; + strcat (out, "."); + strcat (out, buf); + } + else + { + out = (char *) malloc (strlen (buf) + 1); + if (!out) + return IDNA_MALLOC_ERROR; + strcpy (out, buf); + } + + start = end + 1; + } + while (*end); + + *output = out; + + return IDNA_SUCCESS; +} + +/** + * idna_to_ascii_8z: + * @input: zero terminated input UTF-8 string. + * @output: pointer to newly allocated output string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert UTF-8 domain name to ASCII string. The domain name may + * contain several labels, separated by dots. The output buffer must + * be deallocated by the caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_ascii_8z (const char *input, char **output, int flags) +{ + uint32_t *ucs4; + size_t ucs4len; + int rc; + + ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); + if (!ucs4) + return IDNA_ICONV_ERROR; + + rc = idna_to_ascii_4z (ucs4, output, flags); + + free (ucs4); + + return rc; + +} + +/** + * idna_to_ascii_lz: + * @input: zero terminated input UTF-8 string. + * @output: pointer to newly allocated output string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert domain name in the locale's encoding to ASCII string. The + * domain name may contain several labels, separated by dots. The + * output buffer must be deallocated by the caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_ascii_lz (const char *input, char **output, int flags) +{ + char *utf8; + int rc; + + utf8 = stringprep_locale_to_utf8 (input); + if (!utf8) + return IDNA_ICONV_ERROR; + + rc = idna_to_ascii_8z (utf8, output, flags); + + free (utf8); + + return rc; +} + +/** + * idna_to_unicode_4z4z: + * @input: zero-terminated Unicode string. + * @output: pointer to newly allocated output Unicode string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert possibly ACE encoded domain name in UCS-4 format into a + * UCS-4 string. The domain name may contain several labels, + * separated by dots. The output buffer must be deallocated by the + * caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags) +{ + const uint32_t *start = input; + const uint32_t *end = input; + uint32_t *buf; + size_t buflen; + uint32_t *out = NULL; + size_t outlen = 0; + int rc; + + *output = NULL; + + do + { + end = start; + + for (; *end && !DOTP (*end); end++) + ; + + buflen = end - start; + buf = malloc (sizeof (buf[0]) * (buflen + 1)); + if (!buf) + return IDNA_MALLOC_ERROR; + + rc = idna_to_unicode_44i (start, end - start, buf, &buflen, flags); + /* don't check rc as per specification! */ + + if (out) + { + out = realloc (out, sizeof (out[0]) * (outlen + 1 + buflen + 1)); + if (!out) + return IDNA_MALLOC_ERROR; + out[outlen++] = 0x002E; /* '.' (full stop) */ + memcpy (out + outlen, buf, sizeof (buf[0]) * buflen); + outlen += buflen; + out[outlen] = 0x0; + free (buf); + } + else + { + out = buf; + outlen = buflen; + out[outlen] = 0x0; + } + + start = end + 1; + } + while (*end); + + *output = out; + + return IDNA_SUCCESS; +} + +/** + * idna_to_unicode_8z4z: + * @input: zero-terminated UTF-8 string. + * @output: pointer to newly allocated output Unicode string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert possibly ACE encoded domain name in UTF-8 format into a + * UCS-4 string. The domain name may contain several labels, + * separated by dots. The output buffer must be deallocated by the + * caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags) +{ + uint32_t *ucs4; + size_t ucs4len; + int rc; + + ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); + if (!ucs4) + return IDNA_ICONV_ERROR; + + rc = idna_to_unicode_4z4z (ucs4, output, flags); + free (ucs4); + + return rc; +} + +/** + * idna_to_unicode_8z8z: + * @input: zero-terminated UTF-8 string. + * @output: pointer to newly allocated output UTF-8 string. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert possibly ACE encoded domain name in UTF-8 format into a + * UTF-8 string. The domain name may contain several labels, + * separated by dots. The output buffer must be deallocated by the + * caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_unicode_8z8z (const char *input, char **output, int flags) +{ + uint32_t *ucs4; + int rc; + + rc = idna_to_unicode_8z4z (input, &ucs4, flags); + *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL); + free (ucs4); + + if (!*output) + return IDNA_ICONV_ERROR; + + return rc; +} + +/** + * idna_to_unicode_8zlz: + * @input: zero-terminated UTF-8 string. + * @output: pointer to newly allocated output string encoded in the + * current locale's character set. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert possibly ACE encoded domain name in UTF-8 format into a + * string encoded in the current locale's character set. The domain + * name may contain several labels, separated by dots. The output + * buffer must be deallocated by the caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_unicode_8zlz (const char *input, char **output, int flags) +{ + char *utf8; + int rc; + + rc = idna_to_unicode_8z8z (input, &utf8, flags); + *output = stringprep_utf8_to_locale (utf8); + free (utf8); + + if (!*output) + return IDNA_ICONV_ERROR; + + return rc; +} + +/** + * idna_to_unicode_lzlz: + * @input: zero-terminated string encoded in the current locale's + * character set. + * @output: pointer to newly allocated output string encoded in the + * current locale's character set. + * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. + * + * Convert possibly ACE encoded domain name in the locale's character + * set into a string encoded in the current locale's character set. + * The domain name may contain several labels, separated by dots. The + * output buffer must be deallocated by the caller. + * + * Return value: Returns IDNA_SUCCESS on success, or error code. + **/ +int +idna_to_unicode_lzlz (const char *input, char **output, int flags) +{ + char *utf8; + int rc; + + utf8 = stringprep_locale_to_utf8 (input); + if (!utf8) + return IDNA_ICONV_ERROR; + + rc = idna_to_unicode_8zlz (utf8, output, flags); + free (utf8); + + return rc; +} + +/** + * IDNA_ACE_PREFIX + * + * The IANA allocated prefix to use for IDNA. "xn--" + */ + +/** + * Idna_rc: + * @IDNA_SUCCESS: Successful operation. This value is guaranteed to + * always be zero, the remaining ones are only guaranteed to hold + * non-zero values, for logical comparison purposes. + * @IDNA_STRINGPREP_ERROR: Error during string preparation. + * @IDNA_PUNYCODE_ERROR: Error during punycode operation. + * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that + * the string contains non-LDH ASCII characters. + * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that + * the string contains a leading or trailing hyphen-minus (U+002D). + * @IDNA_INVALID_LENGTH: The final output string is not within the + * (inclusive) range 1 to 63 characters. + * @IDNA_NO_ACE_PREFIX: The string does not contain the ACE prefix + * (for ToUnicode). + * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output + * string does not equal the input. + * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for + * ToASCII). + * @IDNA_ICONV_ERROR: Could not convert string in locale encoding. + * @IDNA_MALLOC_ERROR: Could not allocate buffer (this is typically a + * fatal error). + * + * Enumerated return codes of idna_to_ascii_4i(), + * idna_to_unicode_44i() functions (and functions derived from those + * functions). The value 0 is guaranteed to always correspond to + * success. + */ + + +/** + * Idna_flags: + * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned + * Unicode code points. + * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3 + * rules (i.e., normal host name rules). + * + * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc. + */ |