aboutsummaryrefslogtreecommitdiff
path: root/localedata/unicode-gen/gen_translit_compat.py
diff options
context:
space:
mode:
authorCarlos O'Donell <carlos@systemhalted.org>2015-12-09 22:27:41 -0500
committerCarlos O'Donell <carlos@systemhalted.org>2015-12-09 22:52:13 -0500
commitdd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch)
treea2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen/gen_translit_compat.py
parent40b59cace2fd5e5aa04367073a54efc995059376 (diff)
downloadglibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar
glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.gz
glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.bz2
glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.zip
Update transliteration support to Unicode 7.0.0.
The transliteration files are now autogenerated from upstream Unicode data.
Diffstat (limited to 'localedata/unicode-gen/gen_translit_compat.py')
-rw-r--r--localedata/unicode-gen/gen_translit_compat.py326
1 files changed, 326 insertions, 0 deletions
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
new file mode 100644
index 0000000000..0e824a877e
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_compat.py
@@ -0,0 +1,326 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <http://www.gnu.org/licenses/>.
+
+'''
+Generate a translit_compat file from UnicodeData.txt
+
+To see how this script is used, call it with the “-h” option:
+
+ $ ./gen_translit_compat -h
+ … prints usage message …
+'''
+
+import argparse
+import time
+import unicode_utils
+
+def read_input_file(filename):
+ '''Reads the original glibc translit_compat file to get the
+ original head and tail.
+
+ We want to replace only the part of the file between
+ “translit_start” and “translit_end”
+ '''
+ head = tail = ''
+ with open(filename, mode='r') as translit_file:
+ for line in translit_file:
+ head = head + line
+ if line.startswith('translit_start'):
+ break
+ for line in translit_file:
+ if line.startswith('translit_end'):
+ tail = line
+ break
+ for line in translit_file:
+ tail = tail + line
+ return (head, tail)
+
+def output_head(translit_file, unicode_version, head=''):
+ '''Write the header of the output file, i.e. the part of the file
+ before the “translit_start” line.
+ '''
+ if ARGS.input_file and head:
+ translit_file.write(head)
+ else:
+ translit_file.write('escape_char /\n')
+ translit_file.write('comment_char %\n')
+ translit_file.write('\n')
+ translit_file.write('% Transliterations of compatibility characters ')
+ translit_file.write('and ligatures.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_compat.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
+ translit_file.write('translit_start\n')
+
+def output_tail(translit_file, tail=''):
+ '''Write the tail of the output file'''
+ if ARGS.input_file and tail:
+ translit_file.write(tail)
+ else:
+ translit_file.write('translit_end\n')
+ translit_file.write('\n')
+ translit_file.write('END LC_CTYPE\n')
+
+def compatibility_decompose(code_point):
+ '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
+
+ “The compatibility decomposition is formed by recursively applying
+ the canonical and compatibility mappings, then applying the
+ Canonical Ordering Algorithm.”
+
+ We don’t do the canonical decomposition here because this is
+ done in gen_translit_combining.py to generate translit_combining.
+
+ And we ignore some of the possible compatibility formatting tags
+ here. Some of them are used in other translit_* files, not
+ translit_compat:
+
+ <font>: translit_font
+ <circle>: translit_circle
+ <wide>: translit_wide
+ <narrow>: translit_narrow
+ <square>: translit_cjk_compat
+ <fraction>: translit_fraction
+
+ And we ignore
+
+ <noBreak>, <initial>, <medial>, <final>, <isolated>
+
+ because they seem to be not useful for transliteration.
+ '''
+ decomposition = unicode_utils.UNICODE_ATTRIBUTES[
+ code_point]['decomposition']
+ compatibility_tags = (
+ '<compat>', '<super>', '<sub>', '<vertical>')
+ for compatibility_tag in compatibility_tags:
+ if decomposition.startswith(compatibility_tag):
+ decomposition = decomposition[len(compatibility_tag)+1:]
+ decomposed_code_points = [int(x, 16)
+ for x in decomposition.split(' ')]
+ if (len(decomposed_code_points) > 1
+ and decomposed_code_points[0] == 0x0020
+ and decomposed_code_points[1] >= 0x0300
+ and decomposed_code_points[1] <= 0x03FF):
+ # Decomposes into a space followed by a combining character.
+ # This is not useful fo transliteration.
+ return []
+ else:
+ return_value = []
+ for index in range(0, len(decomposed_code_points)):
+ cd_code_points = compatibility_decompose(
+ decomposed_code_points[index])
+ if cd_code_points:
+ return_value += cd_code_points
+ else:
+ return_value += [decomposed_code_points[index]]
+ return return_value
+ return []
+
+def special_decompose(code_point_list):
+ '''
+ Decompositions which are not in UnicodeData.txt at all but which
+ were used in the original translit_compat file in glibc and
+ which seem to make sense. I want to keep the update of
+ translit_compat close to the spirit of the original file,
+ therefore I added this special decomposition rules here.
+ '''
+ special_decompose_dict = {
+ (0x03BC,): [0x0075], # μ → u
+ (0x02BC,): [0x0027], # ʼ → '
+ }
+ if tuple(code_point_list) in special_decompose_dict:
+ return special_decompose_dict[tuple(code_point_list)]
+ else:
+ return code_point_list
+
+def special_ligature_decompose(code_point):
+ '''
+ Decompositions for ligatures which are not in UnicodeData.txt at
+ all but which were used in the original translit_compat file in
+ glibc and which seem to make sense. I want to keep the update of
+ translit_compat close to the spirit of the original file,
+ therefore I added these special ligature decomposition rules here.
+
+ '''
+ special_ligature_decompose_dict = {
+ 0x00E6: [0x0061, 0x0065], # æ → ae
+ 0x00C6: [0x0041, 0x0045], # Æ → AE
+ # These following 5 special ligature decompositions were
+ # in the original glibc/localedata/locales/translit_compat file
+ 0x0152: [0x004F, 0x0045], # Œ → OE
+ 0x0153: [0x006F, 0x0065], # œ → oe
+ 0x05F0: [0x05D5, 0x05D5], # װ → וו
+ 0x05F1: [0x05D5, 0x05D9], # ױ → וי
+ 0x05F2: [0x05D9, 0x05D9], # ײ → יי
+ # The following special ligature decompositions were
+ # not in the original glibc/localedata/locales/translit_compat file
+ # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
+ # → U+041D CYRILLIC CAPITAL LETTER EN,
+ # U+0413 CYRILLIC CAPITAL LETTER GHE
+ 0x04A4: [0x041D, 0x0413], # Ҥ → НГ
+ # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
+ # → U+043D CYRILLIC SMALL LETTER EN,
+ # U+0433 CYRILLIC SMALL LETTER GHE
+ 0x04A5: [0x043D, 0x0433], # ҥ → нг
+ # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
+ # → U+0422 CYRILLIC CAPITAL LETTER TE,
+ # U+0426 CYRILLIC CAPITAL LETTER TSE
+ 0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
+ # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
+ # → U+0442 CYRILLIC SMALL LETTER TE,
+ # U+0446 CYRILLIC SMALL LETTER TSE
+ 0x04B5: [0x0442, 0x0446], # ҵ → тц
+ # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
+ # → U+0410 CYRILLIC CAPITAL LETTER A
+ # U+0415;CYRILLIC CAPITAL LETTER IE
+ 0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
+ # U+04D5 CYRILLIC SMALL LIGATURE A IE
+ # → U+0430 CYRILLIC SMALL LETTER A,
+ # U+0435 CYRILLIC SMALL LETTER IE
+ 0x04D5: [0x0430, 0x0435], # ӕ → ае
+ # I am not sure what to do with the following ligatures
+ # maybe it makes no sense to decompose them:
+ # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
+ # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
+ # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
+ # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+ # U+fe20 COMBINING LIGATURE LEFT HALF
+ # U+fe21 COMBINING LIGATURE RIGHT HALF
+ # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
+ # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
+ # U+11176 MAHAJANI LIGATURE SHRI
+ # U+1f670 SCRIPT LIGATURE ET ORNAMENT
+ # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
+ # U+1f672 LIGATURE OPEN ET ORNAMENT
+ # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
+ }
+ if code_point in special_ligature_decompose_dict:
+ return special_ligature_decompose_dict[code_point]
+ else:
+ return [code_point]
+
+def output_transliteration(translit_file):
+ '''Write the new transliteration to the output file'''
+ translit_file.write('\n')
+ for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
+ name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
+ decomposed_code_points = [compatibility_decompose(code_point)]
+ if not decomposed_code_points[0]:
+ if special_decompose([code_point]) != [code_point]:
+ decomposed_code_points[0] = special_decompose([code_point])
+ else:
+ special_decomposed_code_points = []
+ while True:
+ special_decomposed_code_points = special_decompose(
+ decomposed_code_points[-1])
+ if (special_decomposed_code_points
+ != decomposed_code_points[-1]):
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ continue
+ special_decomposed_code_points = []
+ for decomposed_code_point in decomposed_code_points[-1]:
+ special_decomposed_code_points += special_decompose(
+ [decomposed_code_point])
+ if (special_decomposed_code_points
+ == decomposed_code_points[-1]):
+ break
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ if decomposed_code_points[0]:
+ translit_file.write('% {:s}\n'.format(name))
+ translit_file.write('{:s} '.format(
+ unicode_utils.ucs_symbol(code_point)))
+ for index in range(0, len(decomposed_code_points)):
+ if index > 0:
+ translit_file.write(';')
+ translit_file.write('"')
+ for decomposed_code_point in decomposed_code_points[index]:
+ translit_file.write('{:s}'.format(
+ unicode_utils.ucs_symbol(decomposed_code_point)))
+ translit_file.write('"')
+ translit_file.write('\n')
+ elif 'LIGATURE' in name and 'ARABIC' not in name:
+ decomposed_code_points = special_ligature_decompose(code_point)
+ if decomposed_code_points[0] != code_point:
+ translit_file.write('% {:s}\n'.format(name))
+ translit_file.write('{:s} '.format(
+ unicode_utils.ucs_symbol(code_point)))
+ translit_file.write('"')
+ for decomposed_code_point in decomposed_code_points:
+ translit_file.write('{:s}'.format(
+ unicode_utils.ucs_symbol(decomposed_code_point)))
+ translit_file.write('"')
+ translit_file.write('\n')
+ else:
+ print('Warning: unhandled ligature: {:x} {:s}'.format(
+ code_point, name))
+ translit_file.write('\n')
+
+if __name__ == "__main__":
+ PARSER = argparse.ArgumentParser(
+ description='''
+ Generate a translit_compat file from UnicodeData.txt.
+ ''')
+ PARSER.add_argument(
+ '-u', '--unicode_data_file',
+ nargs='?',
+ type=str,
+ default='UnicodeData.txt',
+ help=('The UnicodeData.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '-i', '--input_file',
+ nargs='?',
+ type=str,
+ help=''' The original glibc/localedata/locales/translit_compat
+ file.''')
+ PARSER.add_argument(
+ '-o', '--output_file',
+ nargs='?',
+ type=str,
+ default='translit_compat.new',
+ help='''The new translit_compat file, default: %(default)s. If the
+ original glibc/localedata/locales/translit_compat file has
+ been given as an option, the header up to the
+ “translit_start” line and the tail from the “translit_end”
+ line to the end of the file will be copied unchanged into the
+ output file. ''')
+ PARSER.add_argument(
+ '--unicode_version',
+ nargs='?',
+ required=True,
+ type=str,
+ help='The Unicode version of the input files used.')
+ ARGS = PARSER.parse_args()
+
+ unicode_utils.fill_attributes(ARGS.unicode_data_file)
+ HEAD = TAIL = ''
+ if ARGS.input_file:
+ (HEAD, TAIL) = read_input_file(ARGS.input_file)
+ with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
+ output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
+ output_transliteration(TRANSLIT_FILE)
+ output_tail(TRANSLIT_FILE, tail=TAIL)