diff options
Diffstat (limited to 'localedata/unicode-gen/utf8_compatibility.py')
-rwxr-xr-x | localedata/unicode-gen/utf8_compatibility.py | 260 |
1 files changed, 0 insertions, 260 deletions
diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py deleted file mode 100755 index 66ec802f6e..0000000000 --- a/localedata/unicode-gen/utf8_compatibility.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -# Copyright (C) 2014-2017 Free Software Foundation, Inc. -# This file is part of the GNU C Library. -# -# The GNU C Library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 2.1 of the License, or (at your option) any later version. -# -# The GNU C Library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with the GNU C Library; if not, see -# <http://www.gnu.org/licenses/>. - -''' -This script is useful for checking backward compatibility of newly -generated UTF-8 file from utf8_gen.py script - -To see how this script is used, call it with the “-h” option: - - $ ./utf8_compatibility.py -h - … prints usage message … -''' - -import sys -import re -import argparse -import unicode_utils - -def create_charmap_dictionary(file_name): - '''Create a dictionary for all code points found in the CHARMAP - section of a file - ''' - with open(file_name, mode='r') as utf8_file: - charmap_dictionary = {} - for line in utf8_file: - if line.startswith('CHARMAP'): - break - for line in utf8_file: - if line.startswith('END CHARMAP'): - return charmap_dictionary - if line.startswith('%'): - continue - match = re.match( - r'^<U(?P<codepoint1>[0-9A-F]{4,8})>' - +r'(:?\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?' - +r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})', - line) - if not match: - continue - codepoint1 = match.group('codepoint1') - codepoint2 = match.group('codepoint2') - if not codepoint2: - codepoint2 = codepoint1 - for i in range(int(codepoint1, 16), - int(codepoint2, 16) + 1): - charmap_dictionary[i] = match.group('hexutf8') - sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n' - %file_name) - exit(1) - -def check_charmap(original_file_name, new_file_name): - '''Report differences in the CHARMAP section between the old and the - new file - ''' - print('************************************************************') - print('Report on CHARMAP:') - ocharmap = create_charmap_dictionary(original_file_name) - ncharmap = create_charmap_dictionary(new_file_name) - print('------------------------------------------------------------') - print('Total removed characters in newly generated CHARMAP: %d' - %len(set(ocharmap)-set(ncharmap))) - if ARGS.show_missing_characters: - for key in sorted(set(ocharmap)-set(ncharmap)): - print('removed: {:s} {:s} {:s}'.format( - unicode_utils.ucs_symbol(key), - ocharmap[key], - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - print('------------------------------------------------------------') - changed_charmap = {} - for key in set(ocharmap).intersection(set(ncharmap)): - if ocharmap[key] != ncharmap[key]: - changed_charmap[key] = (ocharmap[key], ncharmap[key]) - print('Total changed characters in newly generated CHARMAP: %d' - %len(changed_charmap)) - if ARGS.show_changed_characters: - for key in sorted(changed_charmap): - print('changed: {:s} {:s}->{:s} {:s}'.format( - unicode_utils.ucs_symbol(key), - changed_charmap[key][0], - changed_charmap[key][1], - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - print('------------------------------------------------------------') - print('Total added characters in newly generated CHARMAP: %d' - %len(set(ncharmap)-set(ocharmap))) - if ARGS.show_added_characters: - for key in sorted(set(ncharmap)-set(ocharmap)): - print('added: {:s} {:s} {:s}'.format( - unicode_utils.ucs_symbol(key), - ncharmap[key], - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - -def create_width_dictionary(file_name): - '''Create a dictionary for all code points found in the WIDTH - section of a file - ''' - with open(file_name, mode='r') as utf8_file: - width_dictionary = {} - for line in utf8_file: - if line.startswith('WIDTH'): - break - for line in utf8_file: - if line.startswith('END WIDTH'): - return width_dictionary - match = re.match( - r'^<U(?P<codepoint1>[0-9A-F]{4,8})>' - +r'(:?\.\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?' - +r'\s+(?P<width>[02])', - line) - if not match: - continue - codepoint1 = match.group('codepoint1') - codepoint2 = match.group('codepoint2') - if not codepoint2: - codepoint2 = codepoint1 - for i in range(int(codepoint1, 16), - int(codepoint2, 16) + 1): - width_dictionary[i] = int(match.group('width')) - sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n' %file) - -def check_width(original_file_name, new_file_name): - '''Report differences in the WIDTH section between the old and the new - file - ''' - print('************************************************************') - print('Report on WIDTH:') - owidth = create_width_dictionary(original_file_name) - nwidth = create_width_dictionary(new_file_name) - print('------------------------------------------------------------') - print('Total removed characters in newly generated WIDTH: %d' - %len(set(owidth)-set(nwidth))) - print('(Characters not in WIDTH get width 1 by default, ' - + 'i.e. these have width 1 now.)') - if ARGS.show_missing_characters: - for key in sorted(set(owidth)-set(nwidth)): - print('removed: {:s} '.format(unicode_utils.ucs_symbol(key)) - + '{:d} : '.format(owidth[key]) - + 'eaw={:s} '.format( - unicode_utils.EAST_ASIAN_WIDTHS[key] - if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') - + 'category={:2s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['category'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'bidi={:3s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'name={:s}'.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - print('------------------------------------------------------------') - changed_width = {} - for key in set(owidth).intersection(set(nwidth)): - if owidth[key] != nwidth[key]: - changed_width[key] = (owidth[key], nwidth[key]) - print('Total changed characters in newly generated WIDTH: %d' - %len(changed_width)) - if ARGS.show_changed_characters: - for key in sorted(changed_width): - print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key)) - + '{:d}->{:d} : '.format(changed_width[key][0], - changed_width[key][1]) - + 'eaw={:s} '.format( - unicode_utils.EAST_ASIAN_WIDTHS[key] - if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') - + 'category={:2s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['category'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'bidi={:3s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'name={:s}'.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - print('------------------------------------------------------------') - print('Total added characters in newly generated WIDTH: %d' - %len(set(nwidth)-set(owidth))) - print('(Characters not in WIDTH get width 1 by default, ' - + 'i.e. these had width 1 before.)') - if ARGS.show_added_characters: - for key in sorted(set(nwidth)-set(owidth)): - print('added: {:s} '.format(unicode_utils.ucs_symbol(key)) - + '{:d} : '.format(nwidth[key]) - + 'eaw={:s} '.format( - unicode_utils.EAST_ASIAN_WIDTHS[key] - if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') - + 'category={:2s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['category'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'bidi={:3s} '.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') - + 'name={:s}'.format( - unicode_utils.UNICODE_ATTRIBUTES[key]['name'] - if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) - -if __name__ == "__main__": - PARSER = argparse.ArgumentParser( - description=''' - Compare the contents of LC_CTYPE in two files and check for errors. - ''') - PARSER.add_argument( - '-o', '--old_utf8_file', - nargs='?', - required=True, - type=str, - help='The old UTF-8 file.') - PARSER.add_argument( - '-n', '--new_utf8_file', - nargs='?', - required=True, - type=str, - help='The new UTF-8 file.') - PARSER.add_argument( - '-u', '--unicode_data_file', - nargs='?', - type=str, - help='The UnicodeData.txt file to read.') - PARSER.add_argument( - '-e', '--east_asian_width_file', - nargs='?', - type=str, - help='The EastAsianWidth.txt file to read.') - PARSER.add_argument( - '-a', '--show_added_characters', - action='store_true', - help='Show characters which were added in detail.') - PARSER.add_argument( - '-m', '--show_missing_characters', - action='store_true', - help='Show characters which were removed in detail.') - PARSER.add_argument( - '-c', '--show_changed_characters', - action='store_true', - help='Show characters whose width was changed in detail.') - ARGS = PARSER.parse_args() - - if ARGS.unicode_data_file: - unicode_utils.fill_attributes(ARGS.unicode_data_file) - if ARGS.east_asian_width_file: - unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file) - check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file) - check_width(ARGS.old_utf8_file, ARGS.new_utf8_file) |