| author | Carlos O'Donell <carlos@systemhalted.org> | 2015-12-09 22:27:41 -0500 |
|---|---|---|
| committer | Carlos O'Donell <carlos@systemhalted.org> | 2015-12-09 22:52:13 -0500 |
| commit | dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch) | |
| tree | a2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen/utf8_compatibility.py | |
| parent | 40b59cace2fd5e5aa04367073a54efc995059376 (diff) | |
Update transliteration support to Unicode 7.0.0.
The transliteration files are now autogenerated from upstream Unicode
data.
Diffstat (limited to 'localedata/unicode-gen/utf8_compatibility.py')
| -rwxr-xr-x | localedata/unicode-gen/utf8_compatibility.py | 217 |

1 file changed, 39 insertions(+), 178 deletions(-)
```diff
diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py
index b84a1eb3de..3b7a94ccc9 100755
--- a/localedata/unicode-gen/utf8_compatibility.py
+++ b/localedata/unicode-gen/utf8_compatibility.py
@@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option:
 import sys
 import re
 import argparse
-
-# Dictionary holding the entire contents of the UnicodeData.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: {'category': 'Cc',
-#      'title': None,
-#      'digit': '',
-#      'name': '<control>',
-#      'bidi': 'BN',
-#      'combining': '0',
-#      'comment': '',
-#      'oldname': 'NULL',
-#      'decomposition': '',
-#      'upper': None,
-#      'mirrored': 'N',
-#      'lower': None,
-#      'decdigit': '',
-#      'numeric': ''},
-#      …
-# }
-UNICODE_ATTRIBUTES = {}
-
-# Dictionary holding the entire contents of the EastAsianWidths.txt file
-#
-# Contents of this dictionary look like this:
-#
-# {0: 'N', … , 45430: 'W', …}
-EAST_ASIAN_WIDTHS = {}
-
-def fill_attribute(code_point, fields):
-    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
-
-    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
-    in the UnicodeData.txt file.
-
-    '''
-    UNICODE_ATTRIBUTES[code_point] = {
-        'name': fields[1],          # Character name
-        'category': fields[2],      # General category
-        'combining': fields[3],     # Canonical combining classes
-        'bidi': fields[4],          # Bidirectional category
-        'decomposition': fields[5], # Character decomposition mapping
-        'decdigit': fields[6],      # Decimal digit value
-        'digit': fields[7],         # Digit value
-        'numeric': fields[8],       # Numeric value
-        'mirrored': fields[9],      # mirrored
-        'oldname': fields[10],      # Old Unicode 1.0 name
-        'comment': fields[11],      # comment
-        # Uppercase mapping
-        'upper': int(fields[12], 16) if fields[12] else None,
-        # Lowercase mapping
-        'lower': int(fields[13], 16) if fields[13] else None,
-        # Titlecase mapping
-        'title': int(fields[14], 16) if fields[14] else None,
-    }
-
-def fill_attributes(filename):
-    '''Stores the entire contents of the UnicodeData.txt file
-    in the UNICODE_ATTRIBUTES dictionary.
-
-    A typical line for a single code point in UnicodeData.txt looks
-    like this:
-
-    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
-
-    Code point ranges are indicated by pairs of lines like this:
-
-    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
-    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
-    '''
-    with open(filename, mode='r') as unicode_data_file:
-        fields_start = []
-        for line in unicode_data_file:
-            fields = line.strip().split(';')
-            if len(fields) != 15:
-                sys.stderr.write(
-                    'short line in file "%(f)s": %(l)s\n' %{
-                    'f': filename, 'l': line})
-                exit(1)
-            if fields[2] == 'Cs':
-                # Surrogates are UTF-16 artefacts,
-                # not real characters. Ignore them.
-                fields_start = []
-                continue
-            if fields[1].endswith(', First>'):
-                fields_start = fields
-                fields_start[1] = fields_start[1].split(',')[0][1:]
-                continue
-            if fields[1].endswith(', Last>'):
-                fields[1] = fields[1].split(',')[0][1:]
-                if fields[1:] != fields_start[1:]:
-                    sys.stderr.write(
-                        'broken code point range in file "%(f)s": %(l)s\n' %{
-                            'f': filename, 'l': line})
-                    exit(1)
-                for code_point in range(
-                        int(fields_start[0], 16),
-                        int(fields[0], 16)+1):
-                    fill_attribute(code_point, fields)
-                fields_start = []
-                continue
-            fill_attribute(int(fields[0], 16), fields)
-            fields_start = []
-
-def fill_east_asian_widths(filename):
-    '''Stores the entire contents of the EastAsianWidths.txt file
-    in the EAST_ASIAN_WIDTHS dictionary.
-
-    Lines in EastAsianWidths.txt are either a code point range like
-    this:
-
-    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
-
-    or a single code point like this:
-
-    A015;W           # Lm         YI SYLLABLE WU
-    '''
-    with open(filename, mode='r') as east_asian_widths_file:
-        for line in east_asian_widths_file:
-            match = re.match(
-                r'^(?P<codepoint1>[0-9A-F]{4,6})'
-                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
-                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
-                line)
-            if not match:
-                continue
-            start = match.group('codepoint1')
-            end = match.group('codepoint2')
-            if not end:
-                end = start
-            for code_point in range(int(start, 16), int(end, 16)+1):
-                EAST_ASIAN_WIDTHS[code_point] = match.group('property')
-
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
+import unicode_utils
 
 def create_charmap_dictionary(file_name):
     '''Create a dictionary for all code points found in the CHARMAP
@@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_missing_characters:
         for key in sorted(set(ocharmap)-set(ncharmap)):
             print('removed: {:s} {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ocharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_charmap = {}
     for key in set(ocharmap).intersection(set(ncharmap)):
@@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name):
     if ARGS.show_changed_characters:
         for key in sorted(changed_charmap):
             print('changed: {:s} {:s}->{:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 changed_charmap[key][0],
                 changed_charmap[key][1],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated CHARMAP: %d'
           %len(set(ncharmap)-set(ocharmap)))
     if ARGS.show_added_characters:
         for key in sorted(set(ncharmap)-set(ocharmap)):
             print('added: {:s} {:s} {:s}'.format(
-                ucs_symbol(key),
+                unicode_utils.ucs_symbol(key),
                 ncharmap[key],
-                UNICODE_ATTRIBUTES[key]['name'] \
-                if key in UNICODE_ATTRIBUTES else None))
+                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
+                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 def create_width_dictionary(file_name):
     '''Create a dictionary for all code points found in the WIDTH
@@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these have width 1 now.)')
     if ARGS.show_missing_characters:
         for key in sorted(set(owidth)-set(nwidth)):
-            print('removed: {:s} '.format(ucs_symbol(key))
+            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(owidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     changed_width = {}
     for key in set(owidth).intersection(set(nwidth)):
@@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name):
           %len(changed_width))
     if ARGS.show_changed_characters:
         for key in sorted(changed_width):
-            print('changed width: {:s} '.format(ucs_symbol(key))
+            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d}->{:d} : '.format(changed_width[key][0],
                                            changed_width[key][1])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
     print('------------------------------------------------------------')
     print('Total added characters in newly generated WIDTH: %d'
           %len(set(nwidth)-set(owidth)))
@@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
           + 'i.e. these had width 1 before.)')
     if ARGS.show_added_characters:
         for key in sorted(set(nwidth)-set(owidth)):
-            print('added: {:s} '.format(ucs_symbol(key))
+            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(nwidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
@@ -392,8 +253,8 @@ if __name__ == "__main__":
     ARGS = PARSER.parse_args()
 
     if ARGS.unicode_data_file:
-        fill_attributes(ARGS.unicode_data_file)
+        unicode_utils.fill_attributes(ARGS.unicode_data_file)
     if ARGS.east_asian_width_file:
-        fill_east_asian_widths(ARGS.east_asian_width_file)
+        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
```
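The bulk of the patch deletes helpers that this script had been carrying its own copy of (`fill_attribute`, `fill_attributes`, `fill_east_asian_widths`, `ucs_symbol`, and the `UNICODE_ATTRIBUTES` / `EAST_ASIAN_WIDTHS` dictionaries) and replaces them with a single `import unicode_utils`. As a reference for what callers now rely on, here is the deleted `ucs_symbol()` in runnable form; the assumption, consistent with the commit, is that the shared `unicode_utils` copy keeps the same behaviour:

```python
# Sketch of the helper this script now gets from unicode_utils; the body
# is the ucs_symbol() removed above, assumed unchanged in the shared module.
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)   # BMP code points: 4 hex digits
    else:
        return '<U{:08X}>'.format(code_point)   # beyond the BMP: 8 hex digits

print(ucs_symbol(0x0041))    # <U0041>
print(ucs_symbol(0x1F600))   # <U0001F600>
```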
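Beyond the rename, the replacement lines also change the fallback for unknown code points from `None` to the string `'None'`. That is a behaviour fix, not cosmetics: `str.format()` rejects a `None` argument in a field with the `s` presentation type, so the old code would crash on any code point missing from UnicodeData.txt instead of printing a placeholder. A minimal demonstration of the difference, not taken from the glibc sources:

```python
# The old `else None` fallback raises TypeError under the '{:s}' spec.
try:
    print('name={:s}'.format(None))
except TypeError as error:
    print('old fallback crashes:', error)

# The new `else 'None'` fallback prints a literal placeholder instead.
print('name={:s}'.format('None'))   # -> name=None
```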
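The deleted `fill_east_asian_widths()` is worth a closer look, since its parsing logic now has to live in `unicode_utils`: one regex handles both single code points and `..` ranges in the EastAsianWidth data. The following standalone sketch reuses the removed regex on the two example lines quoted in the deleted docstring (the dictionary name is local to the sketch):

```python
import re

# The range/single-point regex from the deleted fill_east_asian_widths().
LINE_RE = re.compile(
    r'^(?P<codepoint1>[0-9A-F]{4,6})'
    r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
    r'\s*;\s*(?P<property>[a-zA-Z]+)')

east_asian_widths = {}
for line in ('9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>',
             'A015;W           # Lm         YI SYLLABLE WU'):
    match = LINE_RE.match(line)
    if not match:
        continue                                  # comments and blank lines
    start = int(match.group('codepoint1'), 16)
    # A missing second group means a single code point, not a range.
    end = int(match.group('codepoint2') or match.group('codepoint1'), 16)
    for code_point in range(start, end + 1):
        east_asian_widths[code_point] = match.group('property')

print(len(east_asian_widths))      # 52: the 51-point range plus one single point
print(east_asian_widths[0xA015])   # W
```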