diff options
author | Mike FABIAN <mfabian@redhat.com> | 2017-08-18 10:12:29 +0200 |
---|---|---|
committer | Mike FABIAN <mfabian@redhat.com> | 2017-09-06 12:37:49 +0200 |
commit | af83ed5c4647bda196fc1a7efebbe8019aa83f4a (patch) | |
tree | 222b4c599fc16758fc87b85220acd38e2ba5d56c /localedata | |
parent | 4f3647e46e3f645c6516faa299efc6e89d520d7b (diff) | |
download | glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.tar glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.tar.gz glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.tar.bz2 glibc-af83ed5c4647bda196fc1a7efebbe8019aa83f4a.zip |
Write all ranges of neighbouring characters with the same width using the range notation in charmaps/UTF-8
Writing ranges of neighbouring characters with the same width like this
<U000E0100>...<U000E01EF> 0
in charmaps/UTF-8 is more efficient than writing many single character lines
like:
<U000E0100> 0
<U000E0101> 0
...
[BZ #21750]
* unicode-gen/utf8_gen.py: Write all ranges of neighbouring characters
with the same width using the range notation in charmaps/UTF-8.
Diffstat (limited to 'localedata')
-rwxr-xr-x | localedata/unicode-gen/utf8_gen.py | 51 |
1 files changed, 38 insertions, 13 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index 1563aa11d2..52c79e83c1 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -199,7 +199,7 @@ def write_header_charmap(outfile): def write_header_width(outfile): '''Writes the header on top of the WIDTH section to the output file''' - outfile.write('% Character width according to Unicode 7.0.0.\n') + outfile.write('% Character width according to Unicode 10.0.0.\n') outfile.write('% - Default width is 1.\n') outfile.write('% - Double-width characters have width 2; generated from\n') outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n') @@ -229,27 +229,52 @@ def process_width(outfile, ulines, elines): code_points = fields[0].split("..") for key in range(int(code_points[0], 16), int(code_points[1], 16)+1): - width_dict[key] = unicode_utils.ucs_symbol(key) + '\t2' + width_dict[key] = 2 for line in ulines: fields = line.split(";") if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"): - width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( - int(fields[0], 16)) + '\t0' + width_dict[int(fields[0], 16)] = 0 # handle special cases for compatibility - for key in list(range(0x1160, 0x1200)) + list(range(0x3248, 0x3250)) + \ - list(range(0x4DC0, 0x4E00)) + list((0x00AD,)): + for key in list((0x00AD,)): + # https://www.cs.tut.fi/~jkorpela/shy.html if key in width_dict: del width_dict[key] - width_dict[0x1160] = '{:s}...{:s}\t0'.format( - unicode_utils.ucs_symbol(0x1160), unicode_utils.ucs_symbol(0x11FF)) - width_dict[0x3248] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(0x3248), unicode_utils.ucs_symbol(0x324F)) - width_dict[0x4DC0] = '{:s}...{:s}\t2'.format( - unicode_utils.ucs_symbol(0x4DC0), unicode_utils.ucs_symbol(0x4DFF)) + for key in list(range(0x1160, 0x1200)): + width_dict[key] = 0 + for key in list(range(0x3248, 0x3250)): + # These are “A” which means we can decide whether to treat them + # as “W” or “N” based on 
context: + # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html + # For us, “W” seems better. + width_dict[key] = 2 + for key in list(range(0x4DC0, 0x4E00)): + width_dict[key] = 2 + same_width_lists = [] + current_width_list = [] for key in sorted(width_dict): - outfile.write(width_dict[key]+'\n') + if not current_width_list: + current_width_list = [key] + elif (key == current_width_list[-1] + 1 + and width_dict[key] == width_dict[current_width_list[0]]): + current_width_list.append(key) + else: + same_width_lists.append(current_width_list) + current_width_list = [key] + if current_width_list: + same_width_lists.append(current_width_list) + + for same_width_list in same_width_lists: + if len(same_width_list) == 1: + outfile.write('{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + width_dict[same_width_list[0]])) + else: + outfile.write('{:s}...{:s}\t{:d}\n'.format( + unicode_utils.ucs_symbol(same_width_list[0]), + unicode_utils.ucs_symbol(same_width_list[-1]), + width_dict[same_width_list[0]])) if __name__ == "__main__": if len(sys.argv) < 3: |