diff options
author | Carlos O'Donell <carlos@systemhalted.org> | 2015-12-09 22:27:41 -0500 |
---|---|---|
committer | Carlos O'Donell <carlos@systemhalted.org> | 2015-12-09 22:52:13 -0500 |
commit | dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch) | |
tree | a2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen | |
parent | 40b59cace2fd5e5aa04367073a54efc995059376 (diff) | |
download | glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.gz glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.bz2 glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.zip |
Update transliteration support to Unicode 7.0.0.
The transliteration files are now autogenerated from upstream Unicode
data.
Diffstat (limited to 'localedata/unicode-gen')
-rw-r--r-- | localedata/unicode-gen/Makefile | 42 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_circle.py | 150 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_cjk_compat.py | 220 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_combining.py | 442 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_compat.py | 326 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_font.py | 156 | ||||
-rw-r--r-- | localedata/unicode-gen/gen_translit_fraction.py | 197 | ||||
-rwxr-xr-x | localedata/unicode-gen/gen_unicode_ctype.py | 497 | ||||
-rw-r--r-- | localedata/unicode-gen/unicode_utils.py | 502 | ||||
-rwxr-xr-x | localedata/unicode-gen/utf8_compatibility.py | 217 | ||||
-rwxr-xr-x | localedata/unicode-gen/utf8_gen.py | 28 |
11 files changed, 2112 insertions, 665 deletions
diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile index 166ee310d8..920bf0eec8 100644 --- a/localedata/unicode-gen/Makefile +++ b/localedata/unicode-gen/Makefile @@ -41,7 +41,7 @@ PYTHON3 = python3 WGET = wget DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt -GENERATED = i18n UTF-8 +GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction REPORTS = i18n-report UTF-8-report all: $(GENERATED) @@ -51,6 +51,12 @@ check: check-i18n check-UTF-8 install: cp -p i18n ../locales/i18n cp -p UTF-8 ../charmaps/UTF-8 + cp -p translit_combining ../locales/translit_combining + cp -p translit_compat ../locales/translit_compat + cp -p translit_circle ../locales/translit_circle + cp -p translit_cjk_compat ../locales/translit_cjk_compat + cp -p translit_font ../locales/translit_font + cp -p translit_fraction ../locales/translit_fraction clean: mostlyclean -rm -rf __pycache__ @@ -82,13 +88,43 @@ UTF-8: utf8_gen.py UTF-8-report: UTF-8 ../charmaps/UTF-8 UTF-8-report: utf8_compatibility.py - $(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \ - -n UTF-8 -a -m > $@ + $(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \ + -e EastAsianWidth.txt -o ../charmaps/UTF-8 \ + -n UTF-8 -a -m -c > $@ check-UTF-8: UTF-8-report @if grep '^Total.*: [^0]' UTF-8-report; \ then echo manual verification required; false; else true; fi +translit_combining: UnicodeData.txt +translit_combining: gen_translit_combining.py + $(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_compat: UnicodeData.txt +translit_compat: gen_translit_compat.py + $(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_circle: UnicodeData.txt +translit_circle: gen_translit_circle.py + $(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + 
+translit_cjk_compat: UnicodeData.txt +translit_cjk_compat: gen_translit_cjk_compat.py + $(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_font: UnicodeData.txt +translit_font: gen_translit_font.py + $(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_fraction: UnicodeData.txt +translit_fraction: gen_translit_fraction.py + $(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) .PHONY: downloads clean-downloads downloads: $(DOWNLOADS) diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py new file mode 100644 index 0000000000..6142859d58 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_circle.py @@ -0,0 +1,150 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_circle file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. 
+ +''' +Generate a translit_circle file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_circle -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_circle file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of encircled characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_circle.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in 
sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<circle>'): + decomposition = decomposition[9:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} "<U0028>'.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('<U0029>"\n') + translit_file.write('\n') + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_circle file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_combining + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_circle.new', + help='''The new translit_circle file, default: %(default)s. If the + original glibc/localedata/locales/translit_circle file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py new file mode 100644 index 0000000000..627ff6bdd9 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_cjk_compat.py @@ -0,0 +1,220 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_cjk_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. 
+ +''' +Generate a translit_cjk_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_cjk_compat -h + … prints usage message … +''' + +import argparse +import time +import sys +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_cjk_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of CJK compatibility ') + translit_file.write('characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_cjk_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were 
used in the original translit_cjk_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_cjk_compat close to the spirit of the original file, + therefore I added this special decomposition rules here. + ''' + special_decompose_dict = { + (0x2215,): [0x002F], # ∕ → / + (0x00B2,): [0x005E, 0x0032], # ² → ^2 + (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN) + (0x2113,): [0x006C], # ℓ → l + (0x00B3,): [0x005E, 0x0033], # ³ → ^3 + (0x00B5,): [0x0075], # µ → u + (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl + (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [ + 0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2], + (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2], + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<square>'): + decomposition = decomposition[9:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + 
translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'): + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if len(decomposed_code_points) != 1: + sys.stderr.write( + 'Unexpected decomposition length {:x} {:s} {:s}\n'.format( + code_point, name, decomposition)) + exit(1) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_cjk_compat file from UnicodeData.txt. 
+ ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_cjk_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_cjk_compat.new', + help='''The new translit_cjk_compat file, default: %(default)s. If the + original glibc/localedata/locales/translit_cjk_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py new file mode 100644 index 0000000000..2551ce1652 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_combining.py @@ -0,0 +1,442 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_combining file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_combining file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_combining -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_combining file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations that remove all ') + translit_file.write('combining characters (accents,\n') + translit_file.write('% pronounciation marks, etc.).\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_combining.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def is_combining_remove(code_point): + '''Check whether this is a combining character which should be listed + in the section of the translit_combining file where combining + characters are replaced by empty strings. + + We ignore combining characters from many scripts here because + the original translit_combining file didn’t do this for the + combining characters from these scripts either and I am not + sure yet whether this would be useful to do for all combining + characters or not. For the moment I think it is better to keep + close to the spirit of the original file. 
+ ''' + if not unicode_utils.is_combining(code_point): + return False + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('DEVANAGARI', + 'BENGALI', + 'CYRILLIC', + 'SYRIAC', + 'THAANA', + 'NKO', + 'GURMUKHI', + 'TAMIL', + 'GUJARATI', + 'ORIYA', + 'TELUGU', + 'KANNADA', + 'MALAYALAM', + 'SINHALA', + 'THAI', + 'LAO', + 'TIBETAN', + 'MYANMAR', + 'ETHIOPIC', + 'TAGALOG', + 'HANUNOO', + 'BUHID', + 'TAGBANWA', + 'KHMER', + 'MONGOLIAN', + 'LIMBU', + 'NEW TAI LUE', + 'BUGINESE', + 'BALINESE', + 'SUNDANESE', + 'LEPCHA', + 'IDEOGRAPHIC', + 'HANGUL', + 'SYLOTI', + 'SAURASHTRA', + 'KAYAH', + 'REJANG', + 'CHAM', + 'VARIATION SELECTOR', + 'KHAROSHTHI', + 'MUSICAL SYMBOL', + 'SAMARITAN', + 'MANDAIC', + 'TAI THAM', + 'BATAK', + 'VEDIC', + 'COPTIC', + 'TIFINAGH', + 'BAMUM', + 'JAVANESE', + 'TAI VIET', + 'MEETEI', + 'MANICHAEAN', + 'BRAHMI', + 'KAITHI', + 'CHAKMA', + 'MAHAJANI', + 'SHARADA', + 'KHOJKI', + 'KHUDAWADI', + 'GRANTHA', + 'TIRHUTA', + 'SIDDHAM', + 'MODI VOWEL', + 'MODI SIGN', + 'TAKRI', + 'BASSA VAH', + 'PAHAWH HMONG', + 'MIAO', + 'DUPLOYAN', + 'MENDE KIKAKUI' + ): + if substring in name: + return False + return True + +def canonical_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + In some instances a canonical mapping or a compatibility mapping + may consist of a single character. For a canonical mapping, this + indicates that the character is a canonical equivalent of another + single character. For a compatibility mapping, this indicates that + the character is a compatibility equivalent of another single + character. + + A canonical mapping may also consist of a pair of characters, but + is never longer than two characters. When a canonical mapping + consists of a pair of characters, the first character may itself + be a character with a decomposition mapping, but the second + character never has a decomposition mapping. 
+ + We ignore the canonical decomposition for code points + matching certain substrings because the original translit_combining + file didn’t include these types of characters either. I am unsure + about the usefulness of including them and want to keep close + to the spirit of the original file for the moment. + ''' + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('MUSICAL SYMBOL', + 'CJK COMPATIBILITY IDEOGRAPH', + 'BALINESE', + 'KAITHI LETTER', + 'CHAKMA VOWEL', + 'GRANTHA VOWEL', + 'TIRHUTA VOWEL', + 'SIDDHAM VOWEL'): + if substring in name: + return [] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and not decomposition.startswith('<'): + decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')] + if decomposed_code_points: + cd0 = canonical_decompose(decomposed_code_points[0]) + if cd0: + decomposed_code_points = cd0 + decomposed_code_points[1:] + return decomposed_code_points + else: + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not canonical or which are not in + UnicodeData.txt at all but some of these were used in the original + translit_combining file in glibc and they seemed to make sense. + I want to keep the update of translit_combining close to the + spirit of the original file, therefore I added these special + decomposition rules here. + ''' + special_decompose_dict = { + # Ø U+00D8 is already handled in translit_neutral. But + # translit_combining is usually included after translit_neutral + # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to Ø U+00D8 and we want to + # further decompose this to U+004F. + (0x00D8,): [0x004F], # Ø → O + # ø U+00F8 is already handled in translit_neutral. 
But + # translit_combining is usually included after translit_neutral + # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to ø U+00F8 and we want to + # further decompose this to U+006F. + (0x00F8,): [0x006F], # ø → o + # æ U+00E6 is already in translit_compat because ligatures + # are handled in translit_compat. But ǣ U+01E3 has a + # canonical decomposition to U+00E6, U+0304 and we want to + # further decompose this to “ae”. + (0x00E6,): [0x0061, 0x0065], # æ → ae + # Æ U+00C6 is already in translit_compat because ligatures + # are handled in translit_compat. But Ǣ U+01E2 has a + # canonical decomposition to U+00C6, U+0304 and we want to + # further decompose this to “AE” + (0x00C6,): [0x0041, 0x0045], # Æ → AE + # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in + # translit_compat because ligatures are handled in translit_compat. + # But U+FB1F has a canonical decomposition to U+05F2 and + # we want to further decompose this to U+05D9, U+05D9. + (0x05F2,): [0x05D9, 0x05D9], # ײ → יי + # 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt + # But U+2000 EN QUAD has a canonical decomposition U+2002 + # and we want to further decompose this to U+0020. + (0x2002,): [0x0020], # EN SPACE → SPACE + # 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt + # But U+2001 EM QUAD has a canonical decomposition to U+2003 + # and we want to further decompose this to U+0020. + (0x2003,): [0x0020], # EM SPACE → SPACE + # U+2260 ≠ has the canonical decomposition U+003D U+0338 + # (= followed by ̸). After stripping the combining characters, + # the result is only = which reverses the meaning. 
+ # Therefore, we add a special rules here for such mathematical + # negations: + (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<-> + (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<= + (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=> + (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=> + (0x2204,): [0x0021, 0x2203], # ∄ → !∃ + (0x2209,): [0x0021, 0x2208], # ∉ → !∈ + (0x220C,): [0x0021, 0x220B], # ∌ → !∋ + (0x2224,): [0x0021, 0x2223], # ∤ → !∣ + (0x2226,): [0x0021, 0x2225], # ∦ → !∥ + (0x2241,): [0x0021, 0x007E], # ≁ → !~ + (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~- + (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~= + (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~ + (0x2260,): [0x0021, 0x003D], # ≠ → != + (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !== + (0x226D,): [0x0021, 0x224D], # ≭ → !≍ + (0x226E,): [0x0021, 0x003C], # ≮ → !< + (0x226F,): [0x0021, 0x003E], # ≯ → !> + (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<= + (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>= + (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~ + (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~ + (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<> + (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !>< + (0x2280,): [0x0021, 0x227A], # ⊀ → !≺ + (0x2281,): [0x0021, 0x227B], # ⊁ → !≻ + (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂ + (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃ + (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂= + (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃= + (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢ + (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨ + (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩ + (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫ + (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼ + (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽ + (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑ + (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒ + (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲ + (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳ + (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴ + (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵ + (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝ + # Special rule for 〈 U+3008 is 
added + # because 〈 U+2329 has the canonical decomposition U+3008 + # and we want to further decompose this to < U+003C. + (0x3008,): [0x003C], # 〈 → < + # Special rule for 〉 U+3009 is added + # because 〉 U+232A has the canonical decomposition U+3009 + # and we want to further decompose this to > U+003E. + (0x3009,): [0x003E], # 〉→ > + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_combining_remove(translit_file): + '''Write the section of the translit_combining file where combining + characters are replaced by empty strings. + ''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + if is_combining_remove(code_point): + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} ""\n'.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('\n') + +def output_decompositions(translit_file): + '''Write the section of the translit_combining file where + characters are decomposed and combining characters stripped from + the decompositions. 
+ ''' + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + if special_decompose([code_point]) != [code_point]: + decomposed_code_points = [special_decompose([code_point])] + else: + decomposed_code_points = [canonical_decompose(code_point)] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + for index in range(0, len(decomposed_code_points)): + decomposed_code_points[index] = [ + x for x in decomposed_code_points[index] + if not is_combining_remove(x)] + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format( + unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'])) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + output_combining_remove(translit_file) + output_decompositions(translit_file) + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_combining file from 
UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_combining + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_combining.new', + help='''The new translit_combining file, default: %(default)s. If the + original glibc/localedata/locales/translit_combining file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py new file mode 100644 index 0000000000..0e824a877e --- /dev/null +++ b/localedata/unicode-gen/gen_translit_compat.py @@ -0,0 +1,326 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_compat -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of compatibility characters ') + translit_file.write('and ligatures.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def compatibility_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + “The compatibility decomposition is formed by recursively applying + the canonical and compatibility mappings, then applying the + Canonical Ordering Algorithm.” + + We don’t do the canonical decomposition here because this is + done in gen_translit_combining.py to generate translit_combining. + + And we ignore some of the possible compatibility formatting tags + here. Some of them are used in other translit_* files, not + translit_compat: + + <font>: translit_font + <circle>: translit_circle + <wide>: translit_wide + <narrow>: translit_narrow + <square>: translit_cjk_compat + <fraction>: translit_fraction + + And we ignore + + <noBreak>, <initial>, <medial>, <final>, <isolated> + + because they seem to be not useful for transliteration. 
+ ''' + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + compatibility_tags = ( + '<compat>', '<super>', '<sub>', '<vertical>') + for compatibility_tag in compatibility_tags: + if decomposition.startswith(compatibility_tag): + decomposition = decomposition[len(compatibility_tag)+1:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if (len(decomposed_code_points) > 1 + and decomposed_code_points[0] == 0x0020 + and decomposed_code_points[1] >= 0x0300 + and decomposed_code_points[1] <= 0x03FF): + # Decomposes into a space followed by a combining character. + # This is not useful fo transliteration. + return [] + else: + return_value = [] + for index in range(0, len(decomposed_code_points)): + cd_code_points = compatibility_decompose( + decomposed_code_points[index]) + if cd_code_points: + return_value += cd_code_points + else: + return_value += [decomposed_code_points[index]] + return return_value + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added this special decomposition rules here. + ''' + special_decompose_dict = { + (0x03BC,): [0x0075], # μ → u + (0x02BC,): [0x0027], # ʼ → ' + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def special_ligature_decompose(code_point): + ''' + Decompositions for ligatures which are not in UnicodeData.txt at + all but which were used in the original translit_compat file in + glibc and which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added these special ligature decomposition rules here. 
+ + ''' + special_ligature_decompose_dict = { + 0x00E6: [0x0061, 0x0065], # æ → ae + 0x00C6: [0x0041, 0x0045], # Æ → AE + # These following 5 special ligature decompositions were + # in the original glibc/localedata/locales/translit_compat file + 0x0152: [0x004F, 0x0045], # Œ → OE + 0x0153: [0x006F, 0x0065], # œ → oe + 0x05F0: [0x05D5, 0x05D5], # װ → וו + 0x05F1: [0x05D5, 0x05D9], # ױ → וי + 0x05F2: [0x05D9, 0x05D9], # ײ → יי + # The following special ligature decompositions were + # not in the original glibc/localedata/locales/translit_compat file + # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE + # → U+041D CYRILLIC CAPITAL LETTER EN, + # U+0413 CYRILLIC CAPITAL LETTER GHE + 0x04A4: [0x041D, 0x0413], # Ҥ → НГ + # U+04A5 CYRILLIC SMALL LIGATURE EN GHE + # → U+043D CYRILLIC SMALL LETTER EN, + # U+0433 CYRILLIC SMALL LETTER GHE + 0x04A5: [0x043D, 0x0433], # ҥ → нг + # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE + # → U+0422 CYRILLIC CAPITAL LETTER TE, + # U+0426 CYRILLIC CAPITAL LETTER TSE + 0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ + # U+04B5 CYRILLIC SMALL LIGATURE TE TSE + # → U+0442 CYRILLIC SMALL LETTER TE, + # U+0446 CYRILLIC SMALL LETTER TSE + 0x04B5: [0x0442, 0x0446], # ҵ → тц + # U+04d4 CYRILLIC CAPITAL LIGATURE A IE + # → U+0410 CYRILLIC CAPITAL LETTER A + # U+0415;CYRILLIC CAPITAL LETTER IE + 0x04D4: [0x0410, 0x0415], # Ӕ → АЕ + # U+04D5 CYRILLIC SMALL LIGATURE A IE + # → U+0430 CYRILLIC SMALL LETTER A, + # U+0435 CYRILLIC SMALL LETTER IE + 0x04D5: [0x0430, 0x0435], # ӕ → ае + # I am not sure what to do with the following ligatures + # maybe it makes no sense to decompose them: + # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH + # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA + # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA + # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM + # U+fe20 COMBINING LIGATURE LEFT HALF + # U+fe21 COMBINING LIGATURE RIGHT HALF + # U+fe27 COMBINING LIGATURE LEFT HALF BELOW + # U+fe28 
COMBINING LIGATURE RIGHT HALF BELOW + # U+11176 MAHAJANI LIGATURE SHRI + # U+1f670 SCRIPT LIGATURE ET ORNAMENT + # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT + # U+1f672 LIGATURE OPEN ET ORNAMENT + # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT + } + if code_point in special_ligature_decompose_dict: + return special_ligature_decompose_dict[code_point] + else: + return [code_point] + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposed_code_points = [compatibility_decompose(code_point)] + if not decomposed_code_points[0]: + if special_decompose([code_point]) != [code_point]: + decomposed_code_points[0] = special_decompose([code_point]) + else: + special_decomposed_code_points = [] + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + elif 'LIGATURE' in name and 'ARABIC' not 
in name: + decomposed_code_points = special_ligature_decompose(code_point) + if decomposed_code_points[0] != code_point: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('"') + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + else: + print('Warning: unhandled ligature: {:x} {:s}'.format( + code_point, name)) + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_compat file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_compat.new', + help='''The new translit_compat file, default: %(default)s. If the + original glibc/localedata/locales/translit_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py new file mode 100644 index 0000000000..072362223f --- /dev/null +++ b/localedata/unicode-gen/gen_translit_font.py @@ -0,0 +1,156 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_font file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. 
+ +''' +Generate a translit_font file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_font -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_font file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of font equivalents.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_font.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + 
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<font>'): + decomposition = decomposition[7:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write(' % {:s}\n'.format(name)) + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_font file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_font + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_font.new', + help='''The new translit_font file, default: %(default)s. If the + original glibc/localedata/locales/translit_font file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py new file mode 100644 index 0000000000..5bf63ea344 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_fraction.py @@ -0,0 +1,197 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_fraction file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. 
+ +''' +Generate a translit_fraction file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_fraction -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_fraction file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of fractions.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_fraction.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('% The replacements have been surrounded ') + translit_file.write('with spaces, because fractions are\n') + translit_file.write('% often preceded by a decimal number and ') + translit_file.write('followed by a unit or a math symbol.\n') + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + 
translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_fraction file in glibc and + which seem to make sense. I want to keep the update of + translit_fraction close to the spirit of the original file, + therefore I added this special decomposition rules here. + ''' + special_decompose_dict = { + (0x2044,): [0x002F], # ⁄ → / + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<fraction>'): + decomposition = decomposition[11:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + decomposed_code_points[0] = [0x0020] \ + + decomposed_code_points[0] \ + + [0x0020] + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for 
index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_cjk_compat file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_fraction + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_fraction.new', + help='''The new translit_fraction file, default: %(default)s. If the + original glibc/localedata/locales/translit_fraction file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py index 0c74f2a849..0f064f5ba5 100755 --- a/localedata/unicode-gen/gen_unicode_ctype.py +++ b/localedata/unicode-gen/gen_unicode_ctype.py @@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option: ''' import argparse -import sys import time import re - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '<control>', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the DerivedCoreProperties.txt file -# -# Contents of this dictionary look like this: -# -# {917504: ['Default_Ignorable_Code_Point'], -# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], -# … -# } -DERIVED_CORE_PROPERTIES = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. 
- - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; - 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. 
- fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_derived_core_properties(filename): - '''Stores the entire contents of the DerivedCoreProperties.txt file - in the DERIVED_CORE_PROPERTIES dictionary. - - Lines in DerivedCoreProperties.txt are either a code point range like - this: - - 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z - - or a single code point like this: - - 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR - - ''' - with open(filename, mode='r') as derived_core_properties_file: - for line in derived_core_properties_file: - match = re.match( - r'^(?P<codepoint1>[0-9A-F]{4,6})' - + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' 
- + r'\s*;\s*(?P<property>[a-zA-Z_]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - prop = match.group('property') - if code_point in DERIVED_CORE_PROPERTIES: - DERIVED_CORE_PROPERTIES[code_point].append(prop) - else: - DERIVED_CORE_PROPERTIES[code_point] = [prop] - -def to_upper(code_point): - '''Returns the code point of the uppercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['upper']): - return UNICODE_ATTRIBUTES[code_point]['upper'] - else: - return code_point - -def to_lower(code_point): - '''Returns the code point of the lowercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['lower']): - return UNICODE_ATTRIBUTES[code_point]['lower'] - else: - return code_point - -def to_title(code_point): - '''Returns the code point of the titlecase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['title']): - return UNICODE_ATTRIBUTES[code_point]['title'] - else: - return code_point - -def is_upper(code_point): - '''Checks whether the character with this code point is uppercase''' - return (to_lower(code_point) != code_point - or (code_point in DERIVED_CORE_PROPERTIES - and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_lower(code_point): - '''Checks whether the character with this code point is lowercase''' - # Some characters are defined as “Lowercase” in - # DerivedCoreProperties.txt but do not have a mapping to upper - # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is - # one of these. - return (to_upper(code_point) != code_point - # <U00DF> is lowercase, but without simple to_upper mapping. 
- or code_point == 0x00DF - or (code_point in DERIVED_CORE_PROPERTIES - and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_alpha(code_point): - '''Checks whether the character with this code point is alphabetic''' - return ((code_point in DERIVED_CORE_PROPERTIES - and - 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) - or - # Consider all the non-ASCII digits as alphabetic. - # ISO C 99 forbids us to have them in category “digit”, - # but we want iswalnum to return true on them. - (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' - and not (code_point >= 0x0030 and code_point <= 0x0039))) - -def is_digit(code_point): - '''Checks whether the character with this code point is a digit''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') - # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without - # a zero. Must add <0> in front of them by hand. - else: - # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.5: - # The iswdigit function tests for any wide character that - # corresponds to a decimal-digit character (as defined in 5.2.1). - # 5.2.1: - # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_outdigit(code_point): - '''Checks whether the character with this code point is outdigit''' - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_blank(code_point): - '''Checks whether the character with this code point is blank''' - return (code_point == 0x0009 # '\t' - # Category Zs without mention of '<noBreak>' - or (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' - and '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])) - -def is_space(code_point): - '''Checks whether the character with this code point is a space''' - # Don’t make U+00A0 a space. 
Non-breaking space means that all programs - # should treat it like a punctuation character, not like a space. - return (code_point == 0x0020 # ' ' - or code_point == 0x000C # '\f' - or code_point == 0x000A # '\n' - or code_point == 0x000D # '\r' - or code_point == 0x0009 # '\t' - or code_point == 0x000B # '\v' - # Categories Zl, Zp, and Zs without mention of "<noBreak>" - or (UNICODE_ATTRIBUTES[code_point]['name'] - and - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] - or - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] - and - '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])))) - -def is_cntrl(code_point): - '''Checks whether the character with this code point is - a control character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>' - or - UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) - -def is_xdigit(code_point): - '''Checks whether the character with this code point is - a hexadecimal digit''' - if False: - return (is_digit(code_point) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - else: - # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.12: - # The iswxdigit function tests for any wide character that - # corresponds to a hexadecimal-digit character (as defined - # in 6.4.4.1). 
- # 6.4.4.1: - # hexadecimal-digit: one of - # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - return ((code_point >= 0x0030 and code_point <= 0x0039) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - -def is_graph(code_point): - '''Checks whether the character with this code point is - a graphical character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and not is_space(code_point)) - -def is_print(code_point): - '''Checks whether the character with this code point is printable''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) - -def is_punct(code_point): - '''Checks whether the character with this code point is punctuation''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) - else: - # The traditional POSIX definition of punctuation is every graphic, - # non-alphanumeric character. - return (is_graph(code_point) - and not is_alpha(code_point) - and not is_digit(code_point)) - -def is_combining(code_point): - '''Checks whether the character with this code point is - a combining character''' - # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt - # file. In 3.0.1 it was identical to the union of the general categories - # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the - # PropList.txt file, so we take the latter definition. 
- return (UNICODE_ATTRIBUTES[code_point]['name'] - and - UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) - -def is_combining_level3(code_point): - '''Checks whether the character with this code point is - a combining level3 character''' - return (is_combining(code_point) - and - int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) - -def ucs_symbol_range(code_point_low, code_point_high): - '''Returns a string UCS symbol string for a code point range. - - Example: - - <U0041>..<U005A> - ''' - return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) +import unicode_utils def code_point_ranges(is_class_function): '''Returns a list of ranges of code points for which is_class_function @@ -379,7 +43,7 @@ def code_point_ranges(is_class_function): [[65, 90], [192, 214], [216, 222], [256], … ] ''' cp_ranges = [] - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): if is_class_function(code_point): if (cp_ranges and cp_ranges[-1][-1] == code_point - 1): @@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function): if line.strip(): line += ';' if len(code_point_range) == 1: - range_string = ucs_symbol(code_point_range[0]) + range_string = unicode_utils.ucs_symbol(code_point_range[0]) else: - range_string = ucs_symbol_range( + range_string = unicode_utils.ucs_symbol_range( code_point_range[0], code_point_range[-1]) if len(line+range_string) > max_column: i18n_file.write(line+'/\n') @@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function): line = prefix map_string = '' i18n_file.write('%s /\n' %map_name) - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): mapped = 
map_function(code_point) if code_point != mapped: if line.strip(): line += ';' map_string = '(' \ - + ucs_symbol(code_point) \ + + unicode_utils.ucs_symbol(code_point) \ + ',' \ - + ucs_symbol(mapped) \ + + unicode_utils.ucs_symbol(mapped) \ + ')' if len(line+map_string) > max_column: i18n_file.write(line+'/\n') @@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function): i18n_file.write(line+'\n') i18n_file.write('\n') -def verifications(): - '''Tests whether the is_* functions observe the known restrictions''' - for code_point in sorted(UNICODE_ATTRIBUTES): - # toupper restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_upper(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_upper(code_point)}) - # tolower restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_lower(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_lower(code_point)}) - # alpha restriction: "Characters classified as either upper or lower - # shall automatically belong to this class. 
- if ((is_lower(code_point) or is_upper(code_point)) - and not is_alpha(code_point)): - sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ - 'sym': ucs_symbol(code_point)}) - # alpha restriction: “No character specified for the keywords cntrl, - # digit, punct or space shall be specified.” - if (is_alpha(code_point) and is_cntrl(code_point)): - sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is alpha and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is alpha and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_space(code_point)): - sys.stderr.write('%(sym)s is alpha and space\n' %{ - 'sym': ucs_symbol(code_point)}) - # space restriction: “No character specified for the keywords upper, - # lower, alpha, digit, graph or xdigit shall be specified.” - # upper, lower, alpha already checked above. - if (is_space(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is space and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is space and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is space and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # cntrl restriction: “No character specified for the keywords upper, - # lower, alpha, digit, punct, graph, print or xdigit shall be - # specified.” upper, lower, alpha already checked above. 
- if (is_cntrl(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is cntrl and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is cntrl and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is cntrl and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_print(code_point)): - sys.stderr.write('%(sym)s is cntrl and print\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # punct restriction: “No character specified for the keywords upper, - # lower, alpha, digit, cntrl, xdigit or as the <space> character shall - # be specified.” upper, lower, alpha, cntrl already checked above. - if (is_punct(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is punct and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is punct and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and code_point == 0x0020): - sys.stderr.write('%(sym)s is punct\n' %{ - 'sym': ucs_symbol(code_point)}) - # graph restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # print restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # graph - print relation: differ only in the <space> character. - # How is this possible if there are more than one space character?! - # I think susv2/xbd/locale.html should speak of “space characters”, - # not “space character”. 
- if (is_print(code_point) - and not (is_graph(code_point) or is_space(code_point))): - sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{ - 'sym': ucs_symbol(code_point)}) - if (not is_print(code_point) - and (is_graph(code_point) or code_point == 0x0020)): - sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{ - 'sym': ucs_symbol(code_point)}) - def read_input_file(filename): '''Reads the original glibc i18n file to get the original head and tail. @@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version): + 'program.\n\n') i18n_file.write('% The "upper" class reflects the uppercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'upper', is_upper) + output_charclass(i18n_file, 'upper', unicode_utils.is_upper) i18n_file.write('% The "lower" class reflects the lowercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'lower', is_lower) + output_charclass(i18n_file, 'lower', unicode_utils.is_lower) i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is ' + 'reflecting\n') i18n_file.write('% the recommendations in TR 10176 annex A\n') - output_charclass(i18n_file, 'alpha', is_alpha) + output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha) i18n_file.write('% The "digit" class must only contain the ' + 'BASIC LATIN digits, says ISO C 99\n') i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n') - output_charclass(i18n_file, 'digit', is_digit) + output_charclass(i18n_file, 'digit', unicode_utils.is_digit) i18n_file.write('% The "outdigit" information is by default ' + '"0" to "9". 
We don\'t have to\n') i18n_file.write('% provide it here since localedef will fill ' @@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version): i18n_file.write('% outdigit /\n') i18n_file.write('% <U0030>..<U0039>\n\n') # output_charclass(i18n_file, 'outdigit', is_outdigit) - output_charclass(i18n_file, 'space', is_space) - output_charclass(i18n_file, 'cntrl', is_cntrl) - output_charclass(i18n_file, 'punct', is_punct) - output_charclass(i18n_file, 'graph', is_graph) - output_charclass(i18n_file, 'print', is_print) + output_charclass(i18n_file, 'space', unicode_utils.is_space) + output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl) + output_charclass(i18n_file, 'punct', unicode_utils.is_punct) + output_charclass(i18n_file, 'graph', unicode_utils.is_graph) + output_charclass(i18n_file, 'print', unicode_utils.is_print) i18n_file.write('% The "xdigit" class must only contain the ' + 'BASIC LATIN digits and A-F, a-f,\n') i18n_file.write('% says ISO C 99 ' + '(sections 7.25.2.1.12 and 6.4.4.1).\n') - output_charclass(i18n_file, 'xdigit', is_xdigit) - output_charclass(i18n_file, 'blank', is_blank) - output_charmap(i18n_file, 'toupper', to_upper) - output_charmap(i18n_file, 'tolower', to_lower) - output_charmap(i18n_file, 'map "totitle";', to_title) + output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit) + output_charclass(i18n_file, 'blank', unicode_utils.is_blank) + output_charmap(i18n_file, 'toupper', unicode_utils.to_upper) + output_charmap(i18n_file, 'tolower', unicode_utils.to_lower) + output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title) i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 ' + 'annex B.1\n') i18n_file.write('% That is, all combining characters (level 2+3).\n') - output_charclass(i18n_file, 'class "combining";', is_combining) + output_charclass(i18n_file, 'class "combining";', + unicode_utils.is_combining) i18n_file.write('% The "combining_level3" class reflects ' + 'ISO/IEC 10646-1 annex B.2\n') 
i18n_file.write('% That is, combining characters of level 3.\n') - output_charclass(i18n_file, - 'class "combining_level3";', is_combining_level3) + output_charclass(i18n_file, 'class "combining_level3";', + unicode_utils.is_combining_level3) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -739,9 +300,11 @@ if __name__ == "__main__": help='The Unicode version of the input files used.') ARGS = PARSER.parse_args() - fill_attributes(ARGS.unicode_data_file) - fill_derived_core_properties(ARGS.derived_core_properties_file) - verifications() + unicode_utils.fill_attributes( + ARGS.unicode_data_file) + unicode_utils.fill_derived_core_properties( + ARGS.derived_core_properties_file) + unicode_utils.verifications() HEAD = TAIL = '' if ARGS.input_file: (HEAD, TAIL) = read_input_file(ARGS.input_file) diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py new file mode 100644 index 0000000000..ee91582823 --- /dev/null +++ b/localedata/unicode-gen/unicode_utils.py @@ -0,0 +1,502 @@ +# Utilities to generate Unicode data for glibc from upstream Unicode data. +# +# Copyright (C) 2014, 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. 
+ +''' +This module contains utilities used by the scripts to generate +Unicode data for glibc from upstream Unicode data files. +''' + +import sys +import re + +# Dictionary holding the entire contents of the UnicodeData.txt file +# +# Contents of this dictionary look like this: +# +# {0: {'category': 'Cc', +# 'title': None, +# 'digit': '', +# 'name': '<control>', +# 'bidi': 'BN', +# 'combining': '0', +# 'comment': '', +# 'oldname': 'NULL', +# 'decomposition': '', +# 'upper': None, +# 'mirrored': 'N', +# 'lower': None, +# 'decdigit': '', +# 'numeric': ''}, +# … +# } +UNICODE_ATTRIBUTES = {} + +# Dictionary holding the entire contents of the DerivedCoreProperties.txt file +# +# Contents of this dictionary look like this: +# +# {917504: ['Default_Ignorable_Code_Point'], +# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], +# … +# } +DERIVED_CORE_PROPERTIES = {} + +# Dictionary holding the entire contents of the EastAsianWidths.txt file +# +# Contents of this dictionary look like this: +# +# {0: 'N', … , 45430: 'W', …} +EAST_ASIAN_WIDTHS = {} + +def fill_attribute(code_point, fields): + '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. + + One entry in the UNICODE_ATTRIBUTES dictionary represents one line + in the UnicodeData.txt file. 
+ + ''' + UNICODE_ATTRIBUTES[code_point] = { + 'name': fields[1], # Character name + 'category': fields[2], # General category + 'combining': fields[3], # Canonical combining classes + 'bidi': fields[4], # Bidirectional category + 'decomposition': fields[5], # Character decomposition mapping + 'decdigit': fields[6], # Decimal digit value + 'digit': fields[7], # Digit value + 'numeric': fields[8], # Numeric value + 'mirrored': fields[9], # mirrored + 'oldname': fields[10], # Old Unicode 1.0 name + 'comment': fields[11], # comment + # Uppercase mapping + 'upper': int(fields[12], 16) if fields[12] else None, + # Lowercase mapping + 'lower': int(fields[13], 16) if fields[13] else None, + # Titlecase mapping + 'title': int(fields[14], 16) if fields[14] else None, + } + +def fill_attributes(filename): + '''Stores the entire contents of the UnicodeData.txt file + in the UNICODE_ATTRIBUTES dictionary. + + A typical line for a single code point in UnicodeData.txt looks + like this: + + 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; + + Code point ranges are indicated by pairs of lines like this: + + 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; + 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; + ''' + with open(filename, mode='r') as unicode_data_file: + fields_start = [] + for line in unicode_data_file: + fields = line.strip().split(';') + if len(fields) != 15: + sys.stderr.write( + 'short line in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + if fields[2] == 'Cs': + # Surrogates are UTF-16 artefacts, + # not real characters. Ignore them. 
+ fields_start = [] + continue + if fields[1].endswith(', First>'): + fields_start = fields + fields_start[1] = fields_start[1].split(',')[0][1:] + continue + if fields[1].endswith(', Last>'): + fields[1] = fields[1].split(',')[0][1:] + if fields[1:] != fields_start[1:]: + sys.stderr.write( + 'broken code point range in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + for code_point in range( + int(fields_start[0], 16), + int(fields[0], 16)+1): + fill_attribute(code_point, fields) + fields_start = [] + continue + fill_attribute(int(fields[0], 16), fields) + fields_start = [] + +def fill_derived_core_properties(filename): + '''Stores the entire contents of the DerivedCoreProperties.txt file + in the DERIVED_CORE_PROPERTIES dictionary. + + Lines in DerivedCoreProperties.txt are either a code point range like + this: + + 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z + + or a single code point like this: + + 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR + + ''' + with open(filename, mode='r') as derived_core_properties_file: + for line in derived_core_properties_file: + match = re.match( + r'^(?P<codepoint1>[0-9A-F]{4,6})' + + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' + + r'\s*;\s*(?P<property>[a-zA-Z_]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + prop = match.group('property') + if code_point in DERIVED_CORE_PROPERTIES: + DERIVED_CORE_PROPERTIES[code_point].append(prop) + else: + DERIVED_CORE_PROPERTIES[code_point] = [prop] + +def fill_east_asian_widths(filename): + '''Stores the entire contents of the EastAsianWidths.txt file + in the EAST_ASIAN_WIDTHS dictionary. 
+ + Lines in EastAsianWidths.txt are either a code point range like + this: + + 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF> + + or a single code point like this: + + A015;W # Lm YI SYLLABLE WU + ''' + with open(filename, mode='r') as east_asian_widths_file: + for line in east_asian_widths_file: + match = re.match( + r'^(?P<codepoint1>[0-9A-F]{4,6})' + +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' + +r'\s*;\s*(?P<property>[a-zA-Z]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + EAST_ASIAN_WIDTHS[code_point] = match.group('property') + +def to_upper(code_point): + '''Returns the code point of the uppercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['upper']): + return UNICODE_ATTRIBUTES[code_point]['upper'] + else: + return code_point + +def to_lower(code_point): + '''Returns the code point of the lowercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['lower']): + return UNICODE_ATTRIBUTES[code_point]['lower'] + else: + return code_point + +def to_title(code_point): + '''Returns the code point of the titlecase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['title']): + return UNICODE_ATTRIBUTES[code_point]['title'] + else: + return code_point + +def is_upper(code_point): + '''Checks whether the character with this code point is uppercase''' + return (to_lower(code_point) != code_point + or (code_point in DERIVED_CORE_PROPERTIES + and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_lower(code_point): + '''Checks whether the character with this code point is lowercase''' + # Some characters are defined as “Lowercase” in + # DerivedCoreProperties.txt but do not have a mapping to 
upper + # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is + # one of these. + return (to_upper(code_point) != code_point + # <U00DF> is lowercase, but without simple to_upper mapping. + or code_point == 0x00DF + or (code_point in DERIVED_CORE_PROPERTIES + and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_alpha(code_point): + '''Checks whether the character with this code point is alphabetic''' + return ((code_point in DERIVED_CORE_PROPERTIES + and + 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) + or + # Consider all the non-ASCII digits as alphabetic. + # ISO C 99 forbids us to have them in category “digit”, + # but we want iswalnum to return true on them. + (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' + and not (code_point >= 0x0030 and code_point <= 0x0039))) + +def is_digit(code_point): + '''Checks whether the character with this code point is a digit''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') + # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without + # a zero. Must add <0> in front of them by hand. + else: + # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 + # takes it away: + # 7.25.2.1.5: + # The iswdigit function tests for any wide character that + # corresponds to a decimal-digit character (as defined in 5.2.1). 
+ # 5.2.1: + # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_outdigit(code_point): + '''Checks whether the character with this code point is outdigit''' + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_blank(code_point): + '''Checks whether the character with this code point is blank''' + return (code_point == 0x0009 # '\t' + # Category Zs without mention of '<noBreak>' + or (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' + and '<noBreak>' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])) + +def is_space(code_point): + '''Checks whether the character with this code point is a space''' + # Don’t make U+00A0 a space. Non-breaking space means that all programs + # should treat it like a punctuation character, not like a space. + return (code_point == 0x0020 # ' ' + or code_point == 0x000C # '\f' + or code_point == 0x000A # '\n' + or code_point == 0x000D # '\r' + or code_point == 0x0009 # '\t' + or code_point == 0x000B # '\v' + # Categories Zl, Zp, and Zs without mention of "<noBreak>" + or (UNICODE_ATTRIBUTES[code_point]['name'] + and + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] + or + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] + and + '<noBreak>' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])))) + +def is_cntrl(code_point): + '''Checks whether the character with this code point is + a control character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>' + or + UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) + +def is_xdigit(code_point): + '''Checks whether the character with this code point is + a hexadecimal digit''' + if False: + return (is_digit(code_point) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + else: + # SUSV2 gives us some freedom for the 
"xdigit" category, but ISO C 99 + # takes it away: + # 7.25.2.1.12: + # The iswxdigit function tests for any wide character that + # corresponds to a hexadecimal-digit character (as defined + # in 6.4.4.1). + # 6.4.4.1: + # hexadecimal-digit: one of + # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F + return ((code_point >= 0x0030 and code_point <= 0x0039) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + +def is_graph(code_point): + '''Checks whether the character with this code point is + a graphical character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' + and not is_space(code_point)) + +def is_print(code_point): + '''Checks whether the character with this code point is printable''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' + and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) + +def is_punct(code_point): + '''Checks whether the character with this code point is punctuation''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) + else: + # The traditional POSIX definition of punctuation is every graphic, + # non-alphanumeric character. + return (is_graph(code_point) + and not is_alpha(code_point) + and not is_digit(code_point)) + +def is_combining(code_point): + '''Checks whether the character with this code point is + a combining character''' + # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt + # file. In 3.0.1 it was identical to the union of the general categories + # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the + # PropList.txt file, so we take the latter definition. 
+ return (UNICODE_ATTRIBUTES[code_point]['name'] + and + UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) + +def is_combining_level3(code_point): + '''Checks whether the character with this code point is + a combining level3 character''' + return (is_combining(code_point) + and + int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) + +def ucs_symbol(code_point): + '''Return the UCS symbol string for a Unicode character.''' + if code_point < 0x10000: + return '<U{:04X}>'.format(code_point) + else: + return '<U{:08X}>'.format(code_point) + +def ucs_symbol_range(code_point_low, code_point_high): + '''Returns a string UCS symbol string for a code point range. + + Example: + + <U0041>..<U005A> + ''' + return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) + +def verifications(): + '''Tests whether the is_* functions observe the known restrictions''' + for code_point in sorted(UNICODE_ATTRIBUTES): + # toupper restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_upper(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_upper(code_point)}) + # tolower restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_lower(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_lower(code_point)}) + # alpha restriction: "Characters classified as either upper or lower + # shall automatically belong to this class. 
+ if ((is_lower(code_point) or is_upper(code_point)) + and not is_alpha(code_point)): + sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ + 'sym': ucs_symbol(code_point)}) + # alpha restriction: “No character specified for the keywords cntrl, + # digit, punct or space shall be specified.” + if (is_alpha(code_point) and is_cntrl(code_point)): + sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is alpha and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is alpha and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_space(code_point)): + sys.stderr.write('%(sym)s is alpha and space\n' %{ + 'sym': ucs_symbol(code_point)}) + # space restriction: “No character specified for the keywords upper, + # lower, alpha, digit, graph or xdigit shall be specified.” + # upper, lower, alpha already checked above. + if (is_space(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is space and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is space and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is space and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # cntrl restriction: “No character specified for the keywords upper, + # lower, alpha, digit, punct, graph, print or xdigit shall be + # specified.” upper, lower, alpha already checked above. 
+ if (is_cntrl(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is cntrl and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is cntrl and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is cntrl and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_print(code_point)): + sys.stderr.write('%(sym)s is cntrl and print\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # punct restriction: “No character specified for the keywords upper, + # lower, alpha, digit, cntrl, xdigit or as the <space> character shall + # be specified.” upper, lower, alpha, cntrl already checked above. + if (is_punct(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is punct and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is punct and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and code_point == 0x0020): + sys.stderr.write('%(sym)s is punct\n' %{ + 'sym': ucs_symbol(code_point)}) + # graph restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # print restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # graph - print relation: differ only in the <space> character. + # How is this possible if there are more than one space character?! + # I think susv2/xbd/locale.html should speak of “space characters”, + # not “space character”. 
+ if (is_print(code_point) + and not (is_graph(code_point) or is_space(code_point))): + sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{ + 'sym': ucs_symbol(code_point)}) + if (not is_print(code_point) + and (is_graph(code_point) or code_point == 0x0020)): + sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{ + 'sym': ucs_symbol(code_point)}) diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py index b84a1eb3de..3b7a94ccc9 100755 --- a/localedata/unicode-gen/utf8_compatibility.py +++ b/localedata/unicode-gen/utf8_compatibility.py @@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option: import sys import re import argparse - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '<control>', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the EastAsianWidths.txt file -# -# Contents of this dictionary look like this: -# -# {0: 'N', … , 45430: 'W', …} -EAST_ASIAN_WIDTHS = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. 
- - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; - 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. 
- fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_east_asian_widths(filename): - '''Stores the entire contents of the EastAsianWidths.txt file - in the EAST_ASIAN_WIDTHS dictionary. - - Lines in EastAsianWidths.txt are either a code point range like - this: - - 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF> - - or a single code point like this: - - A015;W # Lm YI SYLLABLE WU - ''' - with open(filename, mode='r') as east_asian_widths_file: - for line in east_asian_widths_file: - match = re.match( - r'^(?P<codepoint1>[0-9A-F]{4,6})' - +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' 
- +r'\s*;\s*(?P<property>[a-zA-Z]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - EAST_ASIAN_WIDTHS[code_point] = match.group('property') - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) +import unicode_utils def create_charmap_dictionary(file_name): '''Create a dictionary for all code points found in the CHARMAP @@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_missing_characters: for key in sorted(set(ocharmap)-set(ncharmap)): print('removed: {:s} {:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), ocharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_charmap = {} for key in set(ocharmap).intersection(set(ncharmap)): @@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_changed_characters: for key in sorted(changed_charmap): print('changed: {:s} {:s}->{:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), changed_charmap[key][0], changed_charmap[key][1], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated CHARMAP: %d' %len(set(ncharmap)-set(ocharmap))) if ARGS.show_added_characters: for key in sorted(set(ncharmap)-set(ocharmap)): print('added: {:s} {:s} {:s}'.format( - 
ucs_symbol(key), + unicode_utils.ucs_symbol(key), ncharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) def create_width_dictionary(file_name): '''Create a dictionary for all code points found in the WIDTH @@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name): + 'i.e. these have width 1 now.)') if ARGS.show_missing_characters: for key in sorted(set(owidth)-set(nwidth)): - print('removed: {:s} '.format(ucs_symbol(key)) + print('removed: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d} : '.format(owidth[key]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_width = {} for key in set(owidth).intersection(set(nwidth)): @@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name): %len(changed_width)) if ARGS.show_changed_characters: for key in sorted(changed_width): - print('changed width: {:s} '.format(ucs_symbol(key)) + print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d}->{:d} : '.format(changed_width[key][0], changed_width[key][1]) + 
'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated WIDTH: %d' %len(set(nwidth)-set(owidth))) @@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name): + 'i.e. 
these had width 1 before.)') if ARGS.show_added_characters: for key in sorted(set(nwidth)-set(owidth)): - print('added: {:s} '.format(ucs_symbol(key)) + print('added: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d} : '.format(nwidth[key]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -392,8 +253,8 @@ if __name__ == "__main__": ARGS = PARSER.parse_args() if ARGS.unicode_data_file: - fill_attributes(ARGS.unicode_data_file) + unicode_utils.fill_attributes(ARGS.unicode_data_file) if ARGS.east_asian_width_file: - fill_east_asian_widths(ARGS.east_asian_width_file) + unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file) check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file) check_width(ARGS.old_utf8_file, ARGS.new_utf8_file) diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index f1b88f5b29..bc84c07617 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -29,6 +29,7 @@ It will output UTF-8 file import sys import re +import unicode_utils # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, # sections 3.11 and 4.4. 
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = ( 'P', 'H' ) -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) - def process_range(start, end, outfile, name): '''Writes a range of code points into the CHARMAP section of the output file @@ -78,7 +72,7 @@ def process_range(start, end, outfile, name): + JAMO_MEDIAL_SHORT_NAME[index2] \ + JAMO_FINAL_SHORT_NAME[index3] outfile.write('{:<11s} {:<12s} {:s}\n'.format( - ucs_symbol(i), convert_to_hex(i), + unicode_utils.ucs_symbol(i), convert_to_hex(i), hangul_syllable_name)) return # UnicodeData.txt file has contains code point ranges like this: @@ -95,14 +89,14 @@ def process_range(start, end, outfile, name): for i in range(int(start, 16), int(end, 16), 64 ): if i > (int(end, 16)-64): outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( - ucs_symbol(i), - ucs_symbol(int(end,16)), + unicode_utils.ucs_symbol(i), + unicode_utils.ucs_symbol(int(end,16)), convert_to_hex(i), name)) break outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( - ucs_symbol(i), - ucs_symbol(i+63), + unicode_utils.ucs_symbol(i), + unicode_utils.ucs_symbol(i+63), convert_to_hex(i), name)) @@ -168,7 +162,7 @@ def process_charmap(flines, outfile): # comments, so we keep these comment lines. outfile.write('%') outfile.write('{:<11s} {:<12s} {:s}\n'.format( - ucs_symbol(int(fields[0], 16)), + unicode_utils.ucs_symbol(int(fields[0], 16)), convert_to_hex(int(fields[0], 16)), fields[1])) @@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines): for line in ulines: fields = line.split(";") if fields[4] == "NSM" or fields[2] == "Cf": - width_dict[int(fields[0], 16)] = ucs_symbol( + width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( int(fields[0], 16)) + '\t0' for line in elines: @@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines): # UnicodeData.txt: fields = line.split(";") if not '..' 
in fields[0]: - width_dict[int(fields[0], 16)] = ucs_symbol( + width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( int(fields[0], 16)) + '\t2' else: code_points = fields[0].split("..") @@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines): if key in width_dict: del width_dict[key] width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format( - ucs_symbol(int(code_points[0], 16)), - ucs_symbol(int(code_points[1], 16))) + unicode_utils.ucs_symbol(int(code_points[0], 16)), + unicode_utils.ucs_symbol(int(code_points[1], 16))) for key in sorted(width_dict): outfile.write(width_dict[key]+'\n') |