about summary refs log tree commit diff
path: root/localedata/unicode-gen/utf8_gen.py
diff options
context:
space:
mode:
authorMike FABIAN <mfabian@redhat.com>2017-09-06 11:19:33 +0200
committerMike FABIAN <mfabian@redhat.com>2017-09-06 12:39:49 +0200
commit2ae5be041d9ea89cdd0f37734d72051e8f773947 (patch)
treec3d52a8b19d8be09ad89b6bbe6b8f0b462b98a48 /localedata/unicode-gen/utf8_gen.py
parentaf83ed5c4647bda196fc1a7efebbe8019aa83f4a (diff)
downloadglibc-2ae5be041d9ea89cdd0f37734d72051e8f773947.tar
glibc-2ae5be041d9ea89cdd0f37734d72051e8f773947.tar.gz
glibc-2ae5be041d9ea89cdd0f37734d72051e8f773947.tar.bz2
glibc-2ae5be041d9ea89cdd0f37734d72051e8f773947.zip
Improve utf8_gen.py to set the width for characters with Prepended_Concatenation_Mark property to 1
[BZ #22070] * localedata/unicode-gen/utf8_gen.py: Set the width for characters with Prepended_Concatenation_Mark property to 1. * localedata/charmaps/UTF-8: Updated using the improved script.
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-xlocaledata/unicode-gen/utf8_gen.py33
1 file changed, 28 insertions, 5 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index 52c79e83c1..26939e25a8 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -215,9 +215,11 @@ def write_header_width(outfile):
# outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
outfile.write("WIDTH\n")
-def process_width(outfile, ulines, elines):
+def process_width(outfile, ulines, elines, plines):
'''ulines are lines from UnicodeData.txt, elines are lines from
- EastAsianWidth.txt
+ EastAsianWidth.txt containing characters with width “W” or “F”,
+ plines are lines from PropList.txt which contain characters
+ with the property “Prepended_Concatenation_Mark”.
'''
width_dict = {}
@@ -230,16 +232,29 @@ def process_width(outfile, ulines, elines):
for key in range(int(code_points[0], 16),
int(code_points[1], 16)+1):
width_dict[key] = 2
+
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
width_dict[int(fields[0], 16)] = 0
+ for line in plines:
+ # Characters with the property “Prepended_Concatenation_Mark”
+ # should have the width 1:
+ fields = line.split(";")
+ if not '..' in fields[0]:
+ code_points = (fields[0], fields[0])
+ else:
+ code_points = fields[0].split("..")
+ for key in range(int(code_points[0], 16),
+ int(code_points[1], 16)+1):
+ del width_dict[key] # default width is 1
+
# handle special cases for compatibility
for key in list((0x00AD,)):
# https://www.cs.tut.fi/~jkorpela/shy.html
if key in width_dict:
- del width_dict[key]
+ del width_dict[key] # default width is 1
for key in list(range(0x1160, 0x1200)):
width_dict[key] = 0
for key in list(range(0x3248, 0x3250)):
@@ -278,7 +293,7 @@ def process_width(outfile, ulines, elines):
if __name__ == "__main__":
if len(sys.argv) < 3:
- print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
+ print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
else:
with open(sys.argv[1], mode='r') as UNIDATA_FILE:
UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
@@ -298,6 +313,11 @@ if __name__ == "__main__":
continue
if re.match(r'^[^;]*;[WF]', LINE):
EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+ with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
+ PROP_LIST_LINES = []
+ for LINE in PROP_LIST_FILE:
+ if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+ PROP_LIST_LINES.append(LINE.strip())
with open('UTF-8', mode='w') as OUTFILE:
# Processing UnicodeData.txt and write CHARMAP to UTF-8 file
write_header_charmap(OUTFILE)
@@ -305,5 +325,8 @@ if __name__ == "__main__":
OUTFILE.write("END CHARMAP\n\n")
# Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
write_header_width(OUTFILE)
- process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
+ process_width(OUTFILE,
+ UNICODE_DATA_LINES,
+ EAST_ASIAN_WIDTH_LINES,
+ PROP_LIST_LINES)
OUTFILE.write("END WIDTH\n")