From e1663b8dc7ca04db26a37d745bfe043df83d3a64 Mon Sep 17 00:00:00 2001 From: Mike FABIAN Date: Fri, 17 Apr 2015 09:12:05 +0200 Subject: [PATCH 3/5] Update the translit files to Unicode 7.0.0 for localedata/ChangeLog [BZ #16061] * unicode_utils.py: New. * existing scripts changed to used unicode_utils.py * gen_translit_circle.py: New * gen_translit_cjk_compat.py: New * gen_translit_combining.py: New * gen_translit_compat.py: New * gen_translit_font.py: New * gen_translit_fraction.py: New * locales/translit_circle: Update. * locales/translit_cjk_compat: Update. * locales/translit_combining: Update. * locales/translit_compat: Update. * locales/translit_font: Update. * locales/translit_fraction: Update. --- localedata/locales/translit_circle | 30 +- localedata/locales/translit_cjk_compat | 422 +++++++++++++- localedata/locales/translit_combining | 636 +++++++++++++++++++++- localedata/locales/translit_compat | 578 +++++++++++++++++++- localedata/locales/translit_font | 151 ++++- localedata/locales/translit_fraction | 15 +- localedata/unicode-gen/Makefile | 42 +- localedata/unicode-gen/gen_translit_circle.py | 149 +++++ localedata/unicode-gen/gen_translit_cjk_compat.py | 219 ++++++++ localedata/unicode-gen/gen_translit_combining.py | 441 +++++++++++++++ localedata/unicode-gen/gen_translit_compat.py | 325 +++++++++++ localedata/unicode-gen/gen_translit_font.py | 155 ++++++ localedata/unicode-gen/gen_translit_fraction.py | 196 +++++++ localedata/unicode-gen/gen_unicode_ctype.py | 497 +---------------- localedata/unicode-gen/unicode_utils.py | 502 +++++++++++++++++ localedata/unicode-gen/utf8_compatibility.py | 217 ++------ localedata/unicode-gen/utf8_gen.py | 28 +- 17 files changed, 3890 insertions(+), 713 deletions(-) create mode 100755 localedata/unicode-gen/gen_translit_circle.py create mode 100755 localedata/unicode-gen/gen_translit_cjk_compat.py create mode 100755 localedata/unicode-gen/gen_translit_combining.py create mode 100755 
localedata/unicode-gen/gen_translit_compat.py create mode 100755 localedata/unicode-gen/gen_translit_font.py create mode 100755 localedata/unicode-gen/gen_translit_fraction.py create mode 100644 localedata/unicode-gen/unicode_utils.py diff --git a/localedata/locales/translit_circle b/localedata/locales/translit_circle index f701bc9..5d5f58c 100644 --- a/localedata/locales/translit_circle +++ b/localedata/locales/translit_circle @@ -2,9 +2,7 @@ escape_char / comment_char % % Transliterations of encircled characters. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*; \([^;]*\);.*$/ ""% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1> "" % CIRCLED DIGIT ZERO "" +% CIRCLED IDEOGRAPH QUESTION + "" +% CIRCLED IDEOGRAPH KINDERGARTEN + "" +% CIRCLED IDEOGRAPH SCHOOL + "" +% CIRCLED IDEOGRAPH KOTO + "" % CIRCLED NUMBER TWENTY ONE "" % CIRCLED NUMBER TWENTY TWO @@ -242,6 +248,12 @@ translit_start "" % CIRCLED HANGUL HIEUH A "" +% CIRCLED KOREAN CHARACTER CHAMKO + "" +% CIRCLED KOREAN CHARACTER JUEUI + "" +% CIRCLED HANGUL IEUNG U + "" % CIRCLED IDEOGRAPH ONE "" % CIRCLED IDEOGRAPH TWO @@ -464,6 +476,18 @@ translit_start "" % CIRCLED KATAKANA WO "" +% CIRCLED ITALIC LATIN CAPITAL LETTER C + "" +% CIRCLED ITALIC LATIN CAPITAL LETTER R + "" +% CIRCLED CD + "" +% CIRCLED WZ + "" +% CIRCLED IDEOGRAPH ADVANTAGE + "" +% CIRCLED IDEOGRAPH ACCEPT + "" translit_end diff --git a/localedata/locales/translit_cjk_compat b/localedata/locales/translit_cjk_compat index c73e5e3..a20c6ca 100644 --- a/localedata/locales/translit_cjk_compat +++ b/localedata/locales/translit_cjk_compat @@ -2,18 +2,22 @@ escape_char / comment_char % % Transliterations of CJK compatibility characters. 
-% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*; \([^;]*\);.*$/ ""% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1> % \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1>//g' +% Generated automatically from UnicodeData.txt by gen_translit_cjk_compat.py on 2015-06-10 for Unicode 7.0.0. LC_CTYPE translit_start +% PARTNERSHIP SIGN + "" +% SQUARE HG + "" +% SQUARE ERG + "" +% SQUARE EV + "" +% LIMITED LIABILITY SIGN + "" % SQUARE APAATO "" % SQUARE ARUHUA @@ -202,6 +206,14 @@ translit_start "" % SQUARE PC "" +% SQUARE DM + "" +% SQUARE DM SQUARED + "";"" +% SQUARE DM CUBED + "";"" +% SQUARE IU + "" % SQUARE ERA NAME HEISEI "" % SQUARE ERA NAME SYOUWA @@ -400,6 +412,170 @@ translit_start "" % SQUARE WB "" +% SQUARE V OVER M + "";"" +% SQUARE A OVER M + "";"" +% SQUARE GAL + "" +% SQUARED LATIN CAPITAL LETTER A + +% SQUARED LATIN CAPITAL LETTER B + +% SQUARED LATIN CAPITAL LETTER C + +% SQUARED LATIN CAPITAL LETTER D + +% SQUARED LATIN CAPITAL LETTER E + +% SQUARED LATIN CAPITAL LETTER F + +% SQUARED LATIN CAPITAL LETTER G + +% SQUARED LATIN CAPITAL LETTER H + +% SQUARED LATIN CAPITAL LETTER I + +% SQUARED LATIN CAPITAL LETTER J + +% SQUARED LATIN CAPITAL LETTER K + +% SQUARED LATIN CAPITAL LETTER L + +% SQUARED LATIN CAPITAL LETTER M + +% SQUARED LATIN CAPITAL LETTER N + +% SQUARED LATIN CAPITAL LETTER O + +% SQUARED LATIN CAPITAL LETTER P + +% SQUARED LATIN CAPITAL LETTER Q + +% SQUARED LATIN CAPITAL LETTER R + +% SQUARED LATIN CAPITAL LETTER S + +% SQUARED LATIN CAPITAL LETTER T + +% SQUARED LATIN CAPITAL LETTER U + +% SQUARED LATIN CAPITAL LETTER V + +% SQUARED LATIN CAPITAL LETTER W + +% SQUARED LATIN CAPITAL LETTER X + +% SQUARED LATIN CAPITAL LETTER Y + +% SQUARED LATIN CAPITAL LETTER Z + +% SQUARED HV + "" +% SQUARED MV + "" +% SQUARED SD + "" +% SQUARED SS + "" +% SQUARED PPV + "" +% SQUARED WC + "" +% SQUARE 
DJ + "" +% SQUARE HIRAGANA HOKA + "" +% SQUARED KATAKANA KOKO + "" +% SQUARED KATAKANA SA + +% SQUARED CJK UNIFIED IDEOGRAPH-624B + +% SQUARED CJK UNIFIED IDEOGRAPH-5B57 + +% SQUARED CJK UNIFIED IDEOGRAPH-53CC + +% SQUARED KATAKANA DE + +% SQUARED CJK UNIFIED IDEOGRAPH-4E8C + +% SQUARED CJK UNIFIED IDEOGRAPH-591A + +% SQUARED CJK UNIFIED IDEOGRAPH-89E3 + +% SQUARED CJK UNIFIED IDEOGRAPH-5929 + +% SQUARED CJK UNIFIED IDEOGRAPH-4EA4 + +% SQUARED CJK UNIFIED IDEOGRAPH-6620 + +% SQUARED CJK UNIFIED IDEOGRAPH-7121 + +% SQUARED CJK UNIFIED IDEOGRAPH-6599 + +% SQUARED CJK UNIFIED IDEOGRAPH-524D + +% SQUARED CJK UNIFIED IDEOGRAPH-5F8C + +% SQUARED CJK UNIFIED IDEOGRAPH-518D + +% SQUARED CJK UNIFIED IDEOGRAPH-65B0 + +% SQUARED CJK UNIFIED IDEOGRAPH-521D + +% SQUARED CJK UNIFIED IDEOGRAPH-7D42 + +% SQUARED CJK UNIFIED IDEOGRAPH-751F + +% SQUARED CJK UNIFIED IDEOGRAPH-8CA9 + +% SQUARED CJK UNIFIED IDEOGRAPH-58F0 + +% SQUARED CJK UNIFIED IDEOGRAPH-5439 + +% SQUARED CJK UNIFIED IDEOGRAPH-6F14 + +% SQUARED CJK UNIFIED IDEOGRAPH-6295 + +% SQUARED CJK UNIFIED IDEOGRAPH-6355 + +% SQUARED CJK UNIFIED IDEOGRAPH-4E00 + +% SQUARED CJK UNIFIED IDEOGRAPH-4E09 + +% SQUARED CJK UNIFIED IDEOGRAPH-904A + +% SQUARED CJK UNIFIED IDEOGRAPH-5DE6 + +% SQUARED CJK UNIFIED IDEOGRAPH-4E2D + +% SQUARED CJK UNIFIED IDEOGRAPH-53F3 + +% SQUARED CJK UNIFIED IDEOGRAPH-6307 + +% SQUARED CJK UNIFIED IDEOGRAPH-8D70 + +% SQUARED CJK UNIFIED IDEOGRAPH-6253 + +% SQUARED CJK UNIFIED IDEOGRAPH-7981 + +% SQUARED CJK UNIFIED IDEOGRAPH-7A7A + +% SQUARED CJK UNIFIED IDEOGRAPH-5408 + +% SQUARED CJK UNIFIED IDEOGRAPH-6E80 + +% SQUARED CJK UNIFIED IDEOGRAPH-6709 + +% SQUARED CJK UNIFIED IDEOGRAPH-6708 + +% SQUARED CJK UNIFIED IDEOGRAPH-7533 + +% SQUARED CJK UNIFIED IDEOGRAPH-5272 + +% SQUARED CJK UNIFIED IDEOGRAPH-55B6 + % CJK COMPATIBILITY IDEOGRAPH-F900 % CJK COMPATIBILITY IDEOGRAPH-F901 @@ -980,6 +1156,10 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-FA2D +% CJK COMPATIBILITY IDEOGRAPH-FA2E + +% CJK COMPATIBILITY 
IDEOGRAPH-FA2F + % CJK COMPATIBILITY IDEOGRAPH-FA30 % CJK COMPATIBILITY IDEOGRAPH-FA31 @@ -1098,6 +1278,224 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-FA6A +% CJK COMPATIBILITY IDEOGRAPH-FA6B + +% CJK COMPATIBILITY IDEOGRAPH-FA6C + +% CJK COMPATIBILITY IDEOGRAPH-FA6D + +% CJK COMPATIBILITY IDEOGRAPH-FA70 + +% CJK COMPATIBILITY IDEOGRAPH-FA71 + +% CJK COMPATIBILITY IDEOGRAPH-FA72 + +% CJK COMPATIBILITY IDEOGRAPH-FA73 + +% CJK COMPATIBILITY IDEOGRAPH-FA74 + +% CJK COMPATIBILITY IDEOGRAPH-FA75 + +% CJK COMPATIBILITY IDEOGRAPH-FA76 + +% CJK COMPATIBILITY IDEOGRAPH-FA77 + +% CJK COMPATIBILITY IDEOGRAPH-FA78 + +% CJK COMPATIBILITY IDEOGRAPH-FA79 + +% CJK COMPATIBILITY IDEOGRAPH-FA7A + +% CJK COMPATIBILITY IDEOGRAPH-FA7B + +% CJK COMPATIBILITY IDEOGRAPH-FA7C + +% CJK COMPATIBILITY IDEOGRAPH-FA7D + +% CJK COMPATIBILITY IDEOGRAPH-FA7E + +% CJK COMPATIBILITY IDEOGRAPH-FA7F + +% CJK COMPATIBILITY IDEOGRAPH-FA80 + +% CJK COMPATIBILITY IDEOGRAPH-FA81 + +% CJK COMPATIBILITY IDEOGRAPH-FA82 + +% CJK COMPATIBILITY IDEOGRAPH-FA83 + +% CJK COMPATIBILITY IDEOGRAPH-FA84 + +% CJK COMPATIBILITY IDEOGRAPH-FA85 + +% CJK COMPATIBILITY IDEOGRAPH-FA86 + +% CJK COMPATIBILITY IDEOGRAPH-FA87 + +% CJK COMPATIBILITY IDEOGRAPH-FA88 + +% CJK COMPATIBILITY IDEOGRAPH-FA89 + +% CJK COMPATIBILITY IDEOGRAPH-FA8A + +% CJK COMPATIBILITY IDEOGRAPH-FA8B + +% CJK COMPATIBILITY IDEOGRAPH-FA8C + +% CJK COMPATIBILITY IDEOGRAPH-FA8D + +% CJK COMPATIBILITY IDEOGRAPH-FA8E + +% CJK COMPATIBILITY IDEOGRAPH-FA8F + +% CJK COMPATIBILITY IDEOGRAPH-FA90 + +% CJK COMPATIBILITY IDEOGRAPH-FA91 + +% CJK COMPATIBILITY IDEOGRAPH-FA92 + +% CJK COMPATIBILITY IDEOGRAPH-FA93 + +% CJK COMPATIBILITY IDEOGRAPH-FA94 + +% CJK COMPATIBILITY IDEOGRAPH-FA95 + +% CJK COMPATIBILITY IDEOGRAPH-FA96 + +% CJK COMPATIBILITY IDEOGRAPH-FA97 + +% CJK COMPATIBILITY IDEOGRAPH-FA98 + +% CJK COMPATIBILITY IDEOGRAPH-FA99 + +% CJK COMPATIBILITY IDEOGRAPH-FA9A + +% CJK COMPATIBILITY IDEOGRAPH-FA9B + +% CJK COMPATIBILITY IDEOGRAPH-FA9C + +% CJK 
COMPATIBILITY IDEOGRAPH-FA9D + +% CJK COMPATIBILITY IDEOGRAPH-FA9E + +% CJK COMPATIBILITY IDEOGRAPH-FA9F + +% CJK COMPATIBILITY IDEOGRAPH-FAA0 + +% CJK COMPATIBILITY IDEOGRAPH-FAA1 + +% CJK COMPATIBILITY IDEOGRAPH-FAA2 + +% CJK COMPATIBILITY IDEOGRAPH-FAA3 + +% CJK COMPATIBILITY IDEOGRAPH-FAA4 + +% CJK COMPATIBILITY IDEOGRAPH-FAA5 + +% CJK COMPATIBILITY IDEOGRAPH-FAA6 + +% CJK COMPATIBILITY IDEOGRAPH-FAA7 + +% CJK COMPATIBILITY IDEOGRAPH-FAA8 + +% CJK COMPATIBILITY IDEOGRAPH-FAA9 + +% CJK COMPATIBILITY IDEOGRAPH-FAAA + +% CJK COMPATIBILITY IDEOGRAPH-FAAB + +% CJK COMPATIBILITY IDEOGRAPH-FAAC + +% CJK COMPATIBILITY IDEOGRAPH-FAAD + +% CJK COMPATIBILITY IDEOGRAPH-FAAE + +% CJK COMPATIBILITY IDEOGRAPH-FAAF + +% CJK COMPATIBILITY IDEOGRAPH-FAB0 + +% CJK COMPATIBILITY IDEOGRAPH-FAB1 + +% CJK COMPATIBILITY IDEOGRAPH-FAB2 + +% CJK COMPATIBILITY IDEOGRAPH-FAB3 + +% CJK COMPATIBILITY IDEOGRAPH-FAB4 + +% CJK COMPATIBILITY IDEOGRAPH-FAB5 + +% CJK COMPATIBILITY IDEOGRAPH-FAB6 + +% CJK COMPATIBILITY IDEOGRAPH-FAB7 + +% CJK COMPATIBILITY IDEOGRAPH-FAB8 + +% CJK COMPATIBILITY IDEOGRAPH-FAB9 + +% CJK COMPATIBILITY IDEOGRAPH-FABA + +% CJK COMPATIBILITY IDEOGRAPH-FABB + +% CJK COMPATIBILITY IDEOGRAPH-FABC + +% CJK COMPATIBILITY IDEOGRAPH-FABD + +% CJK COMPATIBILITY IDEOGRAPH-FABE + +% CJK COMPATIBILITY IDEOGRAPH-FABF + +% CJK COMPATIBILITY IDEOGRAPH-FAC0 + +% CJK COMPATIBILITY IDEOGRAPH-FAC1 + +% CJK COMPATIBILITY IDEOGRAPH-FAC2 + +% CJK COMPATIBILITY IDEOGRAPH-FAC3 + +% CJK COMPATIBILITY IDEOGRAPH-FAC4 + +% CJK COMPATIBILITY IDEOGRAPH-FAC5 + +% CJK COMPATIBILITY IDEOGRAPH-FAC6 + +% CJK COMPATIBILITY IDEOGRAPH-FAC7 + +% CJK COMPATIBILITY IDEOGRAPH-FAC8 + +% CJK COMPATIBILITY IDEOGRAPH-FAC9 + +% CJK COMPATIBILITY IDEOGRAPH-FACA + +% CJK COMPATIBILITY IDEOGRAPH-FACB + +% CJK COMPATIBILITY IDEOGRAPH-FACC + +% CJK COMPATIBILITY IDEOGRAPH-FACD + +% CJK COMPATIBILITY IDEOGRAPH-FACE + +% CJK COMPATIBILITY IDEOGRAPH-FACF + +% CJK COMPATIBILITY IDEOGRAPH-FAD0 + +% CJK COMPATIBILITY 
IDEOGRAPH-FAD1 + +% CJK COMPATIBILITY IDEOGRAPH-FAD2 + +% CJK COMPATIBILITY IDEOGRAPH-FAD3 + +% CJK COMPATIBILITY IDEOGRAPH-FAD4 + +% CJK COMPATIBILITY IDEOGRAPH-FAD5 + +% CJK COMPATIBILITY IDEOGRAPH-FAD6 + +% CJK COMPATIBILITY IDEOGRAPH-FAD7 + +% CJK COMPATIBILITY IDEOGRAPH-FAD8 + +% CJK COMPATIBILITY IDEOGRAPH-FAD9 + % CJK COMPATIBILITY IDEOGRAPH-2F800 % CJK COMPATIBILITY IDEOGRAPH-2F801 @@ -1307,7 +1705,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F867 % CJK COMPATIBILITY IDEOGRAPH-2F868 - + % CJK COMPATIBILITY IDEOGRAPH-2F869 % CJK COMPATIBILITY IDEOGRAPH-2F86A @@ -1331,7 +1729,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F873 % CJK COMPATIBILITY IDEOGRAPH-2F874 - + % CJK COMPATIBILITY IDEOGRAPH-2F875 % CJK COMPATIBILITY IDEOGRAPH-2F876 @@ -1673,7 +2071,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F91E % CJK COMPATIBILITY IDEOGRAPH-2F91F - + % CJK COMPATIBILITY IDEOGRAPH-2F920 % CJK COMPATIBILITY IDEOGRAPH-2F921 @@ -1801,7 +2199,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F95E % CJK COMPATIBILITY IDEOGRAPH-2F95F - + % CJK COMPATIBILITY IDEOGRAPH-2F960 % CJK COMPATIBILITY IDEOGRAPH-2F961 @@ -1993,7 +2391,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F9BE % CJK COMPATIBILITY IDEOGRAPH-2F9BF - + % CJK COMPATIBILITY IDEOGRAPH-2F9C0 % CJK COMPATIBILITY IDEOGRAPH-2F9C1 diff --git a/localedata/locales/translit_combining b/localedata/locales/translit_combining index 44c62f9..b1b5345 100644 --- a/localedata/locales/translit_combining +++ b/localedata/locales/translit_combining @@ -3,7 +3,7 @@ comment_char % % Transliterations that remove all combining characters (accents, % pronounciation marks, etc.). -% Generated from UnicodeData.txt. +% Generated automatically from UnicodeData.txt by gen_translit_combining.py on 2015-06-10 for Unicode 7.0.0. 
LC_CTYPE @@ -167,6 +167,40 @@ translit_start "" % COMBINING UPWARDS ARROW BELOW "" +% COMBINING GRAPHEME JOINER + "" +% COMBINING RIGHT ARROWHEAD ABOVE + "" +% COMBINING LEFT HALF RING ABOVE + "" +% COMBINING FERMATA + "" +% COMBINING X BELOW + "" +% COMBINING LEFT ARROWHEAD BELOW + "" +% COMBINING RIGHT ARROWHEAD BELOW + "" +% COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW + "" +% COMBINING RIGHT HALF RING ABOVE + "" +% COMBINING DOT ABOVE RIGHT + "" +% COMBINING ASTERISK BELOW + "" +% COMBINING DOUBLE RING BELOW + "" +% COMBINING ZIGZAG ABOVE + "" +% COMBINING DOUBLE BREVE BELOW + "" +% COMBINING DOUBLE BREVE + "" +% COMBINING DOUBLE MACRON + "" +% COMBINING DOUBLE MACRON BELOW + "" % COMBINING DOUBLE TILDE "" % COMBINING DOUBLE INVERTED BREVE @@ -199,6 +233,68 @@ translit_start "" % COMBINING LATIN SMALL LETTER X "" +% HEBREW ACCENT ETNAHTA + "" +% HEBREW ACCENT SEGOL + "" +% HEBREW ACCENT SHALSHELET + "" +% HEBREW ACCENT ZAQEF QATAN + "" +% HEBREW ACCENT ZAQEF GADOL + "" +% HEBREW ACCENT TIPEHA + "" +% HEBREW ACCENT REVIA + "" +% HEBREW ACCENT ZARQA + "" +% HEBREW ACCENT PASHTA + "" +% HEBREW ACCENT YETIV + "" +% HEBREW ACCENT TEVIR + "" +% HEBREW ACCENT GERESH + "" +% HEBREW ACCENT GERESH MUQDAM + "" +% HEBREW ACCENT GERSHAYIM + "" +% HEBREW ACCENT QARNEY PARA + "" +% HEBREW ACCENT TELISHA GEDOLA + "" +% HEBREW ACCENT PAZER + "" +% HEBREW ACCENT ATNAH HAFUKH + "" +% HEBREW ACCENT MUNAH + "" +% HEBREW ACCENT MAHAPAKH + "" +% HEBREW ACCENT MERKHA + "" +% HEBREW ACCENT MERKHA KEFULA + "" +% HEBREW ACCENT DARGA + "" +% HEBREW ACCENT QADMA + "" +% HEBREW ACCENT TELISHA QETANA + "" +% HEBREW ACCENT YERAH BEN YOMO + "" +% HEBREW ACCENT OLE + "" +% HEBREW ACCENT ILUY + "" +% HEBREW ACCENT DEHI + "" +% HEBREW ACCENT ZINOR + "" +% HEBREW MARK MASORA CIRCLE + "" % HEBREW POINT SHEVA "" % HEBREW POINT HATAF SEGOL @@ -219,6 +315,8 @@ translit_start "" % HEBREW POINT HOLAM "" +% HEBREW POINT HOLAM HASER FOR VAV + "" % HEBREW POINT QUBUTS "" % HEBREW POINT DAGESH OR MAPIQ 
@@ -231,12 +329,358 @@ translit_start "" % HEBREW POINT SIN DOT "" +% HEBREW MARK UPPER DOT + "" +% HEBREW MARK LOWER DOT + "" +% HEBREW POINT QAMATS QATAN + "" +% ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM + "" +% ARABIC SIGN ALAYHE ASSALLAM + "" +% ARABIC SIGN RAHMATULLAH ALAYHE + "" +% ARABIC SIGN RADI ALLAHOU ANHU + "" +% ARABIC SIGN TAKHALLUS + "" +% ARABIC SMALL HIGH TAH + "" +% ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH + "" +% ARABIC SMALL HIGH ZAIN + "" +% ARABIC SMALL FATHA + "" +% ARABIC SMALL DAMMA + "" +% ARABIC SMALL KASRA + "" +% ARABIC FATHATAN + "" +% ARABIC DAMMATAN + "" +% ARABIC KASRATAN + "" +% ARABIC FATHA + "" +% ARABIC DAMMA + "" +% ARABIC KASRA + "" +% ARABIC SHADDA + "" +% ARABIC SUKUN + "" % ARABIC MADDAH ABOVE "" % ARABIC HAMZA ABOVE "" % ARABIC HAMZA BELOW "" +% ARABIC SUBSCRIPT ALEF + "" +% ARABIC INVERTED DAMMA + "" +% ARABIC MARK NOON GHUNNA + "" +% ARABIC ZWARAKAY + "" +% ARABIC VOWEL SIGN SMALL V ABOVE + "" +% ARABIC VOWEL SIGN INVERTED SMALL V ABOVE + "" +% ARABIC VOWEL SIGN DOT BELOW + "" +% ARABIC REVERSED DAMMA + "" +% ARABIC FATHA WITH TWO DOTS + "" +% ARABIC WAVY HAMZA BELOW + "" +% ARABIC LETTER SUPERSCRIPT ALEF + "" +% ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA + "" +% ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA + "" +% ARABIC SMALL HIGH MEEM INITIAL FORM + "" +% ARABIC SMALL HIGH LAM ALEF + "" +% ARABIC SMALL HIGH JEEM + "" +% ARABIC SMALL HIGH THREE DOTS + "" +% ARABIC SMALL HIGH SEEN + "" +% ARABIC SMALL HIGH ROUNDED ZERO + "" +% ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO + "" +% ARABIC SMALL HIGH DOTLESS HEAD OF KHAH + "" +% ARABIC SMALL HIGH MEEM ISOLATED FORM + "" +% ARABIC SMALL LOW SEEN + "" +% ARABIC SMALL HIGH MADDA + "" +% ARABIC SMALL HIGH YEH + "" +% ARABIC SMALL HIGH NOON + "" +% ARABIC EMPTY CENTRE LOW STOP + "" +% ARABIC EMPTY CENTRE HIGH STOP + "" +% ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE + "" +% ARABIC SMALL LOW MEEM + "" +% ARABIC CURLY FATHA + "" +% ARABIC CURLY 
DAMMA + "" +% ARABIC CURLY KASRA + "" +% ARABIC CURLY FATHATAN + "" +% ARABIC CURLY DAMMATAN + "" +% ARABIC CURLY KASRATAN + "" +% ARABIC TONE ONE DOT ABOVE + "" +% ARABIC TONE TWO DOTS ABOVE + "" +% ARABIC TONE LOOP ABOVE + "" +% ARABIC TONE ONE DOT BELOW + "" +% ARABIC TONE TWO DOTS BELOW + "" +% ARABIC TONE LOOP BELOW + "" +% ARABIC OPEN FATHATAN + "" +% ARABIC OPEN DAMMATAN + "" +% ARABIC OPEN KASRATAN + "" +% ARABIC SMALL HIGH WAW + "" +% ARABIC FATHA WITH RING + "" +% ARABIC FATHA WITH DOT ABOVE + "" +% ARABIC KASRA WITH DOT BELOW + "" +% ARABIC LEFT ARROWHEAD ABOVE + "" +% ARABIC RIGHT ARROWHEAD ABOVE + "" +% ARABIC LEFT ARROWHEAD BELOW + "" +% ARABIC RIGHT ARROWHEAD BELOW + "" +% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE + "" +% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT + "" +% ARABIC RIGHT ARROWHEAD ABOVE WITH DOT + "" +% ARABIC DAMMA WITH DOT + "" +% ARABIC MARK SIDEWAYS NOON GHUNNA + "" +% COMBINING DOUBLED CIRCUMFLEX ACCENT + "" +% COMBINING DIAERESIS-RING + "" +% COMBINING INFINITY + "" +% COMBINING DOWNWARDS ARROW + "" +% COMBINING TRIPLE DOT + "" +% COMBINING X-X BELOW + "" +% COMBINING WIGGLY LINE BELOW + "" +% COMBINING OPEN MARK BELOW + "" +% COMBINING DOUBLE OPEN MARK BELOW + "" +% COMBINING LIGHT CENTRALIZATION STROKE BELOW + "" +% COMBINING STRONG CENTRALIZATION STROKE BELOW + "" +% COMBINING PARENTHESES ABOVE + "" +% COMBINING DOUBLE PARENTHESES ABOVE + "" +% COMBINING PARENTHESES BELOW + "" +% COMBINING PARENTHESES OVERLAY + "" +% COMBINING DOTTED GRAVE ACCENT + "" +% COMBINING DOTTED ACUTE ACCENT + "" +% COMBINING SNAKE BELOW + "" +% COMBINING SUSPENSION MARK + "" +% COMBINING MACRON-ACUTE + "" +% COMBINING GRAVE-MACRON + "" +% COMBINING MACRON-GRAVE + "" +% COMBINING ACUTE-MACRON + "" +% COMBINING GRAVE-ACUTE-GRAVE + "" +% COMBINING ACUTE-GRAVE-ACUTE + "" +% COMBINING LATIN SMALL LETTER R BELOW + "" +% COMBINING BREVE-MACRON + "" +% COMBINING MACRON-BREVE + "" +% COMBINING DOUBLE CIRCUMFLEX ABOVE + "" +% COMBINING OGONEK ABOVE + "" +% 
COMBINING ZIGZAG BELOW + "" +% COMBINING IS BELOW + "" +% COMBINING UR ABOVE + "" +% COMBINING US ABOVE + "" +% COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE + "" +% COMBINING LATIN SMALL LETTER AE + "" +% COMBINING LATIN SMALL LETTER AO + "" +% COMBINING LATIN SMALL LETTER AV + "" +% COMBINING LATIN SMALL LETTER C CEDILLA + "" +% COMBINING LATIN SMALL LETTER INSULAR D + "" +% COMBINING LATIN SMALL LETTER ETH + "" +% COMBINING LATIN SMALL LETTER G + "" +% COMBINING LATIN LETTER SMALL CAPITAL G + "" +% COMBINING LATIN SMALL LETTER K + "" +% COMBINING LATIN SMALL LETTER L + "" +% COMBINING LATIN LETTER SMALL CAPITAL L + "" +% COMBINING LATIN LETTER SMALL CAPITAL M + "" +% COMBINING LATIN SMALL LETTER N + "" +% COMBINING LATIN LETTER SMALL CAPITAL N + "" +% COMBINING LATIN LETTER SMALL CAPITAL R + "" +% COMBINING LATIN SMALL LETTER R ROTUNDA + "" +% COMBINING LATIN SMALL LETTER S + "" +% COMBINING LATIN SMALL LETTER LONG S + "" +% COMBINING LATIN SMALL LETTER Z + "" +% COMBINING LATIN SMALL LETTER ALPHA + "" +% COMBINING LATIN SMALL LETTER B + "" +% COMBINING LATIN SMALL LETTER BETA + "" +% COMBINING LATIN SMALL LETTER SCHWA + "" +% COMBINING LATIN SMALL LETTER F + "" +% COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE + "" +% COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE + "" +% COMBINING LATIN SMALL LETTER P + "" +% COMBINING LATIN SMALL LETTER ESH + "" +% COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE + "" +% COMBINING LATIN SMALL LETTER W + "" +% COMBINING LATIN SMALL LETTER A WITH DIAERESIS + "" +% COMBINING LATIN SMALL LETTER O WITH DIAERESIS + "" +% COMBINING LATIN SMALL LETTER U WITH DIAERESIS + "" +% COMBINING UP TACK ABOVE + "" +% COMBINING DOUBLE INVERTED BREVE BELOW + "" +% COMBINING ALMOST EQUAL TO BELOW + "" +% COMBINING LEFT ARROWHEAD ABOVE + "" +% COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW + "" +% COMBINING LEFT HARPOON ABOVE + "" +% COMBINING RIGHT HARPOON ABOVE + "" +% COMBINING LONG VERTICAL 
LINE OVERLAY + "" +% COMBINING SHORT VERTICAL LINE OVERLAY + "" +% COMBINING ANTICLOCKWISE ARROW ABOVE + "" +% COMBINING CLOCKWISE ARROW ABOVE + "" +% COMBINING LEFT ARROW ABOVE + "" +% COMBINING RIGHT ARROW ABOVE + "" +% COMBINING RING OVERLAY + "" +% COMBINING CLOCKWISE RING OVERLAY + "" +% COMBINING ANTICLOCKWISE RING OVERLAY + "" +% COMBINING THREE DOTS ABOVE + "" +% COMBINING FOUR DOTS ABOVE + "" +% COMBINING ENCLOSING CIRCLE + "" +% COMBINING ENCLOSING SQUARE + "" +% COMBINING ENCLOSING DIAMOND + "" +% COMBINING ENCLOSING CIRCLE BACKSLASH + "" +% COMBINING LEFT RIGHT ARROW ABOVE + "" +% COMBINING ENCLOSING SCREEN + "" +% COMBINING ENCLOSING KEYCAP + "" % COMBINING ENCLOSING UPWARD POINTING TRIANGLE "" % COMBINING REVERSE SOLIDUS OVERLAY @@ -251,10 +695,70 @@ translit_start "" % COMBINING LEFTWARDS ARROW OVERLAY "" +% COMBINING LONG DOUBLE SOLIDUS OVERLAY + "" +% COMBINING RIGHTWARDS HARPOON WITH BARB DOWNWARDS + "" +% COMBINING LEFTWARDS HARPOON WITH BARB DOWNWARDS + "" +% COMBINING LEFT ARROW BELOW + "" +% COMBINING RIGHT ARROW BELOW + "" +% COMBINING ASTERISK ABOVE + "" % COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK "" % COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK "" +% HEBREW POINT JUDEO-SPANISH VARIKA + "" +% COMBINING LIGATURE LEFT HALF + "" +% COMBINING LIGATURE RIGHT HALF + "" +% COMBINING DOUBLE TILDE LEFT HALF + "" +% COMBINING DOUBLE TILDE RIGHT HALF + "" +% COMBINING MACRON LEFT HALF + "" +% COMBINING MACRON RIGHT HALF + "" +% COMBINING CONJOINING MACRON + "" +% COMBINING LIGATURE LEFT HALF BELOW + "" +% COMBINING LIGATURE RIGHT HALF BELOW + "" +% COMBINING TILDE LEFT HALF BELOW + "" +% COMBINING TILDE RIGHT HALF BELOW + "" +% COMBINING MACRON LEFT HALF BELOW + "" +% COMBINING MACRON RIGHT HALF BELOW + "" +% COMBINING CONJOINING MACRON BELOW + "" +% PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE + "" +% COMBINING OLD PERMIC LETTER AN + "" +% COMBINING OLD PERMIC LETTER DOI + "" +% COMBINING OLD PERMIC LETTER ZATA + "" +% COMBINING OLD PERMIC 
LETTER NENOE + "" +% COMBINING OLD PERMIC LETTER SII + "" +% COMBINING GREEK MUSICAL TRISEME + "" +% COMBINING GREEK MUSICAL TETRASEME + "" +% COMBINING GREEK MUSICAL PENTASEME + "" % LATIN CAPITAL LETTER A WITH GRAVE @@ -268,6 +772,8 @@ translit_start % LATIN CAPITAL LETTER A WITH RING ABOVE +% LATIN CAPITAL LETTER AE + "" % LATIN CAPITAL LETTER C WITH CEDILLA % LATIN CAPITAL LETTER E WITH GRAVE @@ -298,6 +804,8 @@ translit_start % LATIN CAPITAL LETTER O WITH DIAERESIS +% LATIN CAPITAL LETTER O WITH STROKE + % LATIN CAPITAL LETTER U WITH GRAVE % LATIN CAPITAL LETTER U WITH ACUTE @@ -320,6 +828,8 @@ translit_start % LATIN SMALL LETTER A WITH RING ABOVE +% LATIN SMALL LETTER AE + "" % LATIN SMALL LETTER C WITH CEDILLA % LATIN SMALL LETTER E WITH GRAVE @@ -350,6 +860,8 @@ translit_start % LATIN SMALL LETTER O WITH DIAERESIS +% LATIN SMALL LETTER O WITH STROKE + % LATIN SMALL LETTER U WITH GRAVE % LATIN SMALL LETTER U WITH ACUTE @@ -472,10 +984,6 @@ translit_start % LATIN SMALL LETTER L WITH CARON -% LATIN CAPITAL LETTER L WITH STROKE - -% LATIN SMALL LETTER L WITH STROKE - % LATIN CAPITAL LETTER N WITH ACUTE % LATIN SMALL LETTER N WITH ACUTE @@ -673,9 +1181,9 @@ translit_start % LATIN SMALL LETTER AE WITH ACUTE ;"" % LATIN CAPITAL LETTER O WITH STROKE AND ACUTE - + ; % LATIN SMALL LETTER O WITH STROKE AND ACUTE - + ; % LATIN CAPITAL LETTER A WITH DOUBLE GRAVE % LATIN SMALL LETTER A WITH DOUBLE GRAVE @@ -764,14 +1272,6 @@ translit_start % LATIN SMALL LETTER Y WITH MACRON -% COMBINING GRAVE TONE MARK - -% COMBINING ACUTE TONE MARK - -% COMBINING GREEK KORONIS - -% COMBINING GREEK DIALYTIKA TONOS - % GREEK NUMERAL SIGN % GREEK QUESTION MARK @@ -928,6 +1428,8 @@ translit_start % CYRILLIC SMALL LETTER YERU WITH DIAERESIS +% HEBREW LIGATURE YIDDISH DOUBLE YOD + "" % ARABIC LETTER ALEF WITH MADDA ABOVE % ARABIC LETTER ALEF WITH HAMZA ABOVE @@ -1017,7 +1519,7 @@ translit_start % KANNADA VOWEL SIGN O "" % KANNADA VOWEL SIGN OO - "" + "" % MALAYALAM VOWEL SIGN O "" % MALAYALAM 
VOWEL SIGN OO @@ -1029,7 +1531,7 @@ translit_start % SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA "" % SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA - "" + "" % SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA "" % TIBETAN LETTER GHA @@ -2020,16 +2522,114 @@ translit_start ; % EM QUAD ; +% EN SPACE + +% EM SPACE + % OHM SIGN % KELVIN SIGN % ANGSTROM SIGN - + +% LEFTWARDS ARROW WITH STROKE + +% RIGHTWARDS ARROW WITH STROKE + +% LEFT RIGHT ARROW WITH STROKE + "" +% LEFTWARDS DOUBLE ARROW WITH STROKE + "" +% LEFT RIGHT DOUBLE ARROW WITH STROKE + "" +% RIGHTWARDS DOUBLE ARROW WITH STROKE + "" +% THERE DOES NOT EXIST + "" +% NOT AN ELEMENT OF + "" +% DOES NOT CONTAIN AS MEMBER + "" +% DOES NOT DIVIDE + "" +% NOT PARALLEL TO + "" +% NOT TILDE + "" +% NOT ASYMPTOTICALLY EQUAL TO + "" +% NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO + "" +% NOT ALMOST EQUAL TO + "" +% NOT EQUAL TO + "" +% NOT IDENTICAL TO + "" +% NOT EQUIVALENT TO + "" +% NOT LESS-THAN + "" +% NOT GREATER-THAN + "" +% NEITHER LESS-THAN NOR EQUAL TO + "" +% NEITHER GREATER-THAN NOR EQUAL TO + "" +% NEITHER LESS-THAN NOR EQUIVALENT TO + "" +% NEITHER GREATER-THAN NOR EQUIVALENT TO + "" +% NEITHER LESS-THAN NOR GREATER-THAN + "" +% NEITHER GREATER-THAN NOR LESS-THAN + "" +% DOES NOT PRECEDE + "" +% DOES NOT SUCCEED + "" +% NOT A SUBSET OF + "" +% NOT A SUPERSET OF + "" +% NEITHER A SUBSET OF NOR EQUAL TO + "" +% NEITHER A SUPERSET OF NOR EQUAL TO + "" +% DOES NOT PROVE + "" +% NOT TRUE + "" +% DOES NOT FORCE + "" +% NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE + "" +% DOES NOT PRECEDE OR EQUAL + "" +% DOES NOT SUCCEED OR EQUAL + "" +% NOT SQUARE IMAGE OF OR EQUAL TO + "" +% NOT SQUARE ORIGINAL OF OR EQUAL TO + "" +% NOT NORMAL SUBGROUP OF + "" +% DOES NOT CONTAIN AS NORMAL SUBGROUP + "" +% NOT NORMAL SUBGROUP OF OR EQUAL TO + "" +% DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL + "" % LEFT-POINTING ANGLE BRACKET ; % RIGHT-POINTING ANGLE BRACKET ; +% FORKING + "" +% LEFT ANGLE BRACKET + +% RIGHT ANGLE BRACKET + % 
HIRAGANA LETTER GA % HIRAGANA LETTER GI diff --git a/localedata/locales/translit_compat b/localedata/locales/translit_compat index bb9d660..6e45220 100644 --- a/localedata/locales/translit_compat +++ b/localedata/locales/translit_compat @@ -2,18 +2,24 @@ escape_char / comment_char % % Transliterations of compatibility characters and ligatures. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*; \([^;]*\);.*$/ ""% \2/' | grep -v '0020 03[0-6][0-9A-F]' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1> "" +% SUPERSCRIPT TWO + "" +% SUPERSCRIPT THREE + "" % MICRO SIGN - ""; + "";"" +% SUPERSCRIPT ONE + "" +% MASCULINE ORDINAL INDICATOR + "" % LATIN CAPITAL LIGATURE IJ "" % LATIN SMALL LIGATURE IJ @@ -54,6 +60,38 @@ translit_start "" % LATIN SMALL LETTER DZ "" +% MODIFIER LETTER SMALL H + "" +% MODIFIER LETTER SMALL H WITH HOOK + "" +% MODIFIER LETTER SMALL J + "" +% MODIFIER LETTER SMALL R + "" +% MODIFIER LETTER SMALL TURNED R + "" +% MODIFIER LETTER SMALL TURNED R WITH HOOK + "" +% MODIFIER LETTER SMALL CAPITAL INVERTED R + "" +% MODIFIER LETTER SMALL W + "" +% MODIFIER LETTER SMALL Y + "" +% MODIFIER LETTER APOSTROPHE + "" +% MODIFIER LETTER SMALL GAMMA + "" +% MODIFIER LETTER SMALL L + "" +% MODIFIER LETTER SMALL S + "" +% MODIFIER LETTER SMALL X + "" +% MODIFIER LETTER SMALL REVERSED GLOTTAL STOP + "" +% GREEK SMALL LETTER MU + "" % GREEK BETA SYMBOL "" % GREEK THETA SYMBOL @@ -74,6 +112,20 @@ translit_start "" % GREEK LUNATE EPSILON SYMBOL "" +% GREEK CAPITAL LUNATE SIGMA SYMBOL + "" +% CYRILLIC CAPITAL LIGATURE EN GHE + "" +% CYRILLIC SMALL LIGATURE EN GHE + "" +% CYRILLIC CAPITAL LIGATURE TE TSE + "" +% CYRILLIC SMALL LIGATURE TE TSE + "" +% CYRILLIC CAPITAL LIGATURE A IE + "" +% CYRILLIC SMALL LIGATURE A IE + "" % ARMENIAN SMALL LIGATURE ECH YIWN "" % HEBREW LIGATURE YIDDISH DOUBLE VAV @@ -102,6 +154,204 @@ translit_start "" % TIBETAN VOWEL 
SIGN VOCALIC LL "" +% MODIFIER LETTER GEORGIAN NAR + "" +% MODIFIER LETTER CAPITAL A + "" +% MODIFIER LETTER CAPITAL AE + "" +% MODIFIER LETTER CAPITAL B + "" +% MODIFIER LETTER CAPITAL D + "" +% MODIFIER LETTER CAPITAL E + "" +% MODIFIER LETTER CAPITAL REVERSED E + "" +% MODIFIER LETTER CAPITAL G + "" +% MODIFIER LETTER CAPITAL H + "" +% MODIFIER LETTER CAPITAL I + "" +% MODIFIER LETTER CAPITAL J + "" +% MODIFIER LETTER CAPITAL K + "" +% MODIFIER LETTER CAPITAL L + "" +% MODIFIER LETTER CAPITAL M + "" +% MODIFIER LETTER CAPITAL N + "" +% MODIFIER LETTER CAPITAL O + "" +% MODIFIER LETTER CAPITAL OU + "" +% MODIFIER LETTER CAPITAL P + "" +% MODIFIER LETTER CAPITAL R + "" +% MODIFIER LETTER CAPITAL T + "" +% MODIFIER LETTER CAPITAL U + "" +% MODIFIER LETTER CAPITAL W + "" +% MODIFIER LETTER SMALL A + "" +% MODIFIER LETTER SMALL TURNED A + "" +% MODIFIER LETTER SMALL ALPHA + "" +% MODIFIER LETTER SMALL TURNED AE + "" +% MODIFIER LETTER SMALL B + "" +% MODIFIER LETTER SMALL D + "" +% MODIFIER LETTER SMALL E + "" +% MODIFIER LETTER SMALL SCHWA + "" +% MODIFIER LETTER SMALL OPEN E + "" +% MODIFIER LETTER SMALL TURNED OPEN E + "" +% MODIFIER LETTER SMALL G + "" +% MODIFIER LETTER SMALL K + "" +% MODIFIER LETTER SMALL M + "" +% MODIFIER LETTER SMALL ENG + "" +% MODIFIER LETTER SMALL O + "" +% MODIFIER LETTER SMALL OPEN O + "" +% MODIFIER LETTER SMALL TOP HALF O + "" +% MODIFIER LETTER SMALL BOTTOM HALF O + "" +% MODIFIER LETTER SMALL P + "" +% MODIFIER LETTER SMALL T + "" +% MODIFIER LETTER SMALL U + "" +% MODIFIER LETTER SMALL SIDEWAYS U + "" +% MODIFIER LETTER SMALL TURNED M + "" +% MODIFIER LETTER SMALL V + "" +% MODIFIER LETTER SMALL AIN + "" +% MODIFIER LETTER SMALL BETA + "" +% MODIFIER LETTER SMALL GREEK GAMMA + "" +% MODIFIER LETTER SMALL DELTA + "" +% MODIFIER LETTER SMALL GREEK PHI + "" +% MODIFIER LETTER SMALL CHI + "" +% LATIN SUBSCRIPT SMALL LETTER I + "" +% LATIN SUBSCRIPT SMALL LETTER R + "" +% LATIN SUBSCRIPT SMALL LETTER U + "" +% LATIN SUBSCRIPT SMALL 
LETTER V + "" +% GREEK SUBSCRIPT SMALL LETTER BETA + "" +% GREEK SUBSCRIPT SMALL LETTER GAMMA + "" +% GREEK SUBSCRIPT SMALL LETTER RHO + "" +% GREEK SUBSCRIPT SMALL LETTER PHI + "" +% GREEK SUBSCRIPT SMALL LETTER CHI + "" +% MODIFIER LETTER CYRILLIC EN + "" +% MODIFIER LETTER SMALL TURNED ALPHA + "" +% MODIFIER LETTER SMALL C + "" +% MODIFIER LETTER SMALL C WITH CURL + "" +% MODIFIER LETTER SMALL ETH + "" +% MODIFIER LETTER SMALL REVERSED OPEN E + "" +% MODIFIER LETTER SMALL F + "" +% MODIFIER LETTER SMALL DOTLESS J WITH STROKE + "" +% MODIFIER LETTER SMALL SCRIPT G + "" +% MODIFIER LETTER SMALL TURNED H + "" +% MODIFIER LETTER SMALL I WITH STROKE + "" +% MODIFIER LETTER SMALL IOTA + "" +% MODIFIER LETTER SMALL CAPITAL I + "" +% MODIFIER LETTER SMALL CAPITAL I WITH STROKE + "" +% MODIFIER LETTER SMALL J WITH CROSSED-TAIL + "" +% MODIFIER LETTER SMALL L WITH RETROFLEX HOOK + "" +% MODIFIER LETTER SMALL L WITH PALATAL HOOK + "" +% MODIFIER LETTER SMALL CAPITAL L + "" +% MODIFIER LETTER SMALL M WITH HOOK + "" +% MODIFIER LETTER SMALL TURNED M WITH LONG LEG + "" +% MODIFIER LETTER SMALL N WITH LEFT HOOK + "" +% MODIFIER LETTER SMALL N WITH RETROFLEX HOOK + "" +% MODIFIER LETTER SMALL CAPITAL N + "" +% MODIFIER LETTER SMALL BARRED O + "" +% MODIFIER LETTER SMALL PHI + "" +% MODIFIER LETTER SMALL S WITH HOOK + "" +% MODIFIER LETTER SMALL ESH + "" +% MODIFIER LETTER SMALL T WITH PALATAL HOOK + "" +% MODIFIER LETTER SMALL U BAR + "" +% MODIFIER LETTER SMALL UPSILON + "" +% MODIFIER LETTER SMALL CAPITAL U + "" +% MODIFIER LETTER SMALL V WITH HOOK + "" +% MODIFIER LETTER SMALL TURNED V + "" +% MODIFIER LETTER SMALL Z + "" +% MODIFIER LETTER SMALL Z WITH RETROFLEX HOOK + "" +% MODIFIER LETTER SMALL Z WITH CURL + "" +% MODIFIER LETTER SMALL EZH + "" +% MODIFIER LETTER SMALL THETA + "" % LATIN SMALL LETTER A WITH RIGHT HALF RING "" % EN SPACE @@ -146,6 +396,90 @@ translit_start "" % MEDIUM MATHEMATICAL SPACE "" +% SUPERSCRIPT ZERO + "" +% SUPERSCRIPT LATIN SMALL LETTER I + "" 
+% SUPERSCRIPT FOUR + "" +% SUPERSCRIPT FIVE + "" +% SUPERSCRIPT SIX + "" +% SUPERSCRIPT SEVEN + "" +% SUPERSCRIPT EIGHT + "" +% SUPERSCRIPT NINE + "" +% SUPERSCRIPT PLUS SIGN + "" +% SUPERSCRIPT MINUS + "" +% SUPERSCRIPT EQUALS SIGN + "" +% SUPERSCRIPT LEFT PARENTHESIS + "" +% SUPERSCRIPT RIGHT PARENTHESIS + "" +% SUPERSCRIPT LATIN SMALL LETTER N + "" +% SUBSCRIPT ZERO + "" +% SUBSCRIPT ONE + "" +% SUBSCRIPT TWO + "" +% SUBSCRIPT THREE + "" +% SUBSCRIPT FOUR + "" +% SUBSCRIPT FIVE + "" +% SUBSCRIPT SIX + "" +% SUBSCRIPT SEVEN + "" +% SUBSCRIPT EIGHT + "" +% SUBSCRIPT NINE + "" +% SUBSCRIPT PLUS SIGN + "" +% SUBSCRIPT MINUS + "" +% SUBSCRIPT EQUALS SIGN + "" +% SUBSCRIPT LEFT PARENTHESIS + "" +% SUBSCRIPT RIGHT PARENTHESIS + "" +% LATIN SUBSCRIPT SMALL LETTER A + "" +% LATIN SUBSCRIPT SMALL LETTER E + "" +% LATIN SUBSCRIPT SMALL LETTER O + "" +% LATIN SUBSCRIPT SMALL LETTER X + "" +% LATIN SUBSCRIPT SMALL LETTER SCHWA + "" +% LATIN SUBSCRIPT SMALL LETTER H + "" +% LATIN SUBSCRIPT SMALL LETTER K + "" +% LATIN SUBSCRIPT SMALL LETTER L + "" +% LATIN SUBSCRIPT SMALL LETTER M + "" +% LATIN SUBSCRIPT SMALL LETTER N + "" +% LATIN SUBSCRIPT SMALL LETTER P + "" +% LATIN SUBSCRIPT SMALL LETTER S + "" +% LATIN SUBSCRIPT SMALL LETTER T + "" % RUPEE SIGN "" % ACCOUNT OF @@ -164,8 +498,12 @@ translit_start "" % NUMERO SIGN "" +% SERVICE MARK + "" % TELEPHONE SIGN "" +% TRADE MARK SIGN + "" % ALEF SYMBOL "" % BET SYMBOL @@ -174,6 +512,8 @@ translit_start "" % DALET SYMBOL "" +% FACSIMILE SIGN + "" % ROMAN NUMERAL ONE "" % ROMAN NUMERAL TWO @@ -386,6 +726,12 @@ translit_start "" % THREE CONSECUTIVE EQUALS SIGNS "" +% LATIN SUBSCRIPT SMALL LETTER J + "" +% MODIFIER LETTER CAPITAL V + "" +% TIFINAGH MODIFIER LETTER LABIALIZATION MARK + "" % CJK RADICAL MOTHER "" % CJK RADICAL C-SIMPLIFIED TURTLE @@ -830,6 +1176,10 @@ translit_start "" % KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK "" +% HIRAGANA DIGRAPH YORI + "" +% KATAKANA DIGRAPH KOTO + "" % HANGUL LETTER KIYEOK "" % HANGUL LETTER 
SSANGKIYEOK @@ -1018,6 +1368,34 @@ translit_start "" % HANGUL LETTER ARAEAE "" +% IDEOGRAPHIC ANNOTATION ONE MARK + "" +% IDEOGRAPHIC ANNOTATION TWO MARK + "" +% IDEOGRAPHIC ANNOTATION THREE MARK + "" +% IDEOGRAPHIC ANNOTATION FOUR MARK + "" +% IDEOGRAPHIC ANNOTATION TOP MARK + "" +% IDEOGRAPHIC ANNOTATION MIDDLE MARK + "" +% IDEOGRAPHIC ANNOTATION BOTTOM MARK + "" +% IDEOGRAPHIC ANNOTATION FIRST MARK + "" +% IDEOGRAPHIC ANNOTATION SECOND MARK + "" +% IDEOGRAPHIC ANNOTATION THIRD MARK + "" +% IDEOGRAPHIC ANNOTATION FOURTH MARK + "" +% IDEOGRAPHIC ANNOTATION HEAVEN MARK + "" +% IDEOGRAPHIC ANNOTATION EARTH MARK + "" +% IDEOGRAPHIC ANNOTATION MAN MARK + "" % PARENTHESIZED HANGUL KIYEOK "" % PARENTHESIZED HANGUL NIEUN @@ -1076,6 +1454,10 @@ translit_start "" % PARENTHESIZED HANGUL CIEUC U "" +% PARENTHESIZED KOREAN CHARACTER OJEON + "" +% PARENTHESIZED KOREAN CHARACTER O HU + "" % PARENTHESIZED IDEOGRAPH ONE "" % PARENTHESIZED IDEOGRAPH TWO @@ -1284,6 +1666,24 @@ translit_start "" % IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE "" +% MODIFIER LETTER CYRILLIC HARD SIGN + "" +% MODIFIER LETTER CYRILLIC SOFT SIGN + "" +% MODIFIER LETTER US + "" +% MODIFIER LETTER CAPITAL H WITH STROKE + "" +% MODIFIER LETTER SMALL LIGATURE OE + "" +% MODIFIER LETTER SMALL HENG + "" +% MODIFIER LETTER SMALL L WITH INVERTED LAZY S + "" +% MODIFIER LETTER SMALL L WITH MIDDLE TILDE + "" +% MODIFIER LETTER SMALL U WITH LEFT HOOK + "" % LATIN SMALL LIGATURE FF "" % LATIN SMALL LIGATURE FI @@ -1295,7 +1695,7 @@ translit_start % LATIN SMALL LIGATURE FFL "" % LATIN SMALL LIGATURE LONG S T - "" + "" % LATIN SMALL LIGATURE ST "" % ARMENIAN SMALL LIGATURE MEN NOW @@ -1310,6 +1710,72 @@ translit_start "" % HEBREW LIGATURE ALEF LAMED "" +% PRESENTATION FORM FOR VERTICAL COMMA + "" +% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA + "" +% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP + "" +% PRESENTATION FORM FOR VERTICAL COLON + "" +% PRESENTATION FORM FOR VERTICAL SEMICOLON + "" +% 
PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK + "" +% PRESENTATION FORM FOR VERTICAL QUESTION MARK + "" +% PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET + "" +% PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS + "" +% PRESENTATION FORM FOR VERTICAL TWO DOT LEADER + "" +% PRESENTATION FORM FOR VERTICAL EM DASH + "" +% PRESENTATION FORM FOR VERTICAL EN DASH + "" +% PRESENTATION FORM FOR VERTICAL LOW LINE + "" +% PRESENTATION FORM FOR VERTICAL WAVY LOW LINE + "" +% PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS + "" +% PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS + "" +% PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + "" +% PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET + "" +% PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET + "" % DASHED OVERLINE "" % CENTRELINE OVERLINE @@ -1324,6 +1790,104 @@ translit_start "" % WAVY LOW LINE "" +% DIGIT ZERO FULL STOP + "" +% DIGIT ZERO COMMA + "" +% DIGIT ONE COMMA + "" +% DIGIT TWO COMMA + "" +% DIGIT THREE COMMA + "" +% DIGIT FOUR COMMA + "" +% DIGIT FIVE COMMA + "" +% DIGIT SIX COMMA + "" +% DIGIT SEVEN 
COMMA + "" +% DIGIT EIGHT COMMA + "" +% DIGIT NINE COMMA + "" +% PARENTHESIZED LATIN CAPITAL LETTER A + "" +% PARENTHESIZED LATIN CAPITAL LETTER B + "" +% PARENTHESIZED LATIN CAPITAL LETTER C + "" +% PARENTHESIZED LATIN CAPITAL LETTER D + "" +% PARENTHESIZED LATIN CAPITAL LETTER E + "" +% PARENTHESIZED LATIN CAPITAL LETTER F + "" +% PARENTHESIZED LATIN CAPITAL LETTER G + "" +% PARENTHESIZED LATIN CAPITAL LETTER H + "" +% PARENTHESIZED LATIN CAPITAL LETTER I + "" +% PARENTHESIZED LATIN CAPITAL LETTER J + "" +% PARENTHESIZED LATIN CAPITAL LETTER K + "" +% PARENTHESIZED LATIN CAPITAL LETTER L + "" +% PARENTHESIZED LATIN CAPITAL LETTER M + "" +% PARENTHESIZED LATIN CAPITAL LETTER N + "" +% PARENTHESIZED LATIN CAPITAL LETTER O + "" +% PARENTHESIZED LATIN CAPITAL LETTER P + "" +% PARENTHESIZED LATIN CAPITAL LETTER Q + "" +% PARENTHESIZED LATIN CAPITAL LETTER R + "" +% PARENTHESIZED LATIN CAPITAL LETTER S + "" +% PARENTHESIZED LATIN CAPITAL LETTER T + "" +% PARENTHESIZED LATIN CAPITAL LETTER U + "" +% PARENTHESIZED LATIN CAPITAL LETTER V + "" +% PARENTHESIZED LATIN CAPITAL LETTER W + "" +% PARENTHESIZED LATIN CAPITAL LETTER X + "" +% PARENTHESIZED LATIN CAPITAL LETTER Y + "" +% PARENTHESIZED LATIN CAPITAL LETTER Z + "" +% TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S + "" +% RAISED MC SIGN + "" +% RAISED MD SIGN + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E09 + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E8C + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-5B89 + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-70B9 + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6253 + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-76D7 + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-52DD + "" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 + "" translit_end diff --git a/localedata/locales/translit_font b/localedata/locales/translit_font index 9347bd4..65e0d90 100644 
--- a/localedata/locales/translit_font +++ b/localedata/locales/translit_font @@ -2,9 +2,7 @@ escape_char / comment_char % % Transliterations of font equivalents. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*; \([^;]*\);.*$/ % \2/' +% Generated automatically from UnicodeData.txt by gen_translit_font.py on 2015-06-10 for Unicode 7.0.0. LC_CTYPE @@ -37,6 +35,7 @@ translit_start % SCRIPT CAPITAL M % SCRIPT SMALL O % INFORMATION SOURCE + % DOUBLE-STRUCK SMALL PI % DOUBLE-STRUCK SMALL GAMMA % DOUBLE-STRUCK CAPITAL GAMMA % DOUBLE-STRUCK CAPITAL PI @@ -238,6 +237,7 @@ translit_start % MATHEMATICAL SCRIPT SMALL I % MATHEMATICAL SCRIPT SMALL J % MATHEMATICAL SCRIPT SMALL K + % MATHEMATICAL SCRIPT SMALL L % MATHEMATICAL SCRIPT SMALL M % MATHEMATICAL SCRIPT SMALL N % MATHEMATICAL SCRIPT SMALL P @@ -707,6 +707,8 @@ translit_start % MATHEMATICAL MONOSPACE SMALL X % MATHEMATICAL MONOSPACE SMALL Y % MATHEMATICAL MONOSPACE SMALL Z + % MATHEMATICAL ITALIC SMALL DOTLESS I + % MATHEMATICAL ITALIC SMALL DOTLESS J % MATHEMATICAL BOLD CAPITAL ALPHA % MATHEMATICAL BOLD CAPITAL BETA % MATHEMATICAL BOLD CAPITAL GAMMA @@ -997,6 +999,8 @@ translit_start % MATHEMATICAL SANS-SERIF BOLD ITALIC PHI SYMBOL % MATHEMATICAL SANS-SERIF BOLD ITALIC RHO SYMBOL % MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL + % MATHEMATICAL BOLD CAPITAL DIGAMMA + % MATHEMATICAL BOLD SMALL DIGAMMA % MATHEMATICAL BOLD DIGIT ZERO % MATHEMATICAL BOLD DIGIT ONE % MATHEMATICAL BOLD DIGIT TWO @@ -1047,6 +1051,147 @@ translit_start % MATHEMATICAL MONOSPACE DIGIT SEVEN % MATHEMATICAL MONOSPACE DIGIT EIGHT % MATHEMATICAL MONOSPACE DIGIT NINE + % ARABIC MATHEMATICAL ALEF + % ARABIC MATHEMATICAL BEH + % ARABIC MATHEMATICAL JEEM + % ARABIC MATHEMATICAL DAL + % ARABIC MATHEMATICAL WAW + % ARABIC MATHEMATICAL ZAIN + % ARABIC MATHEMATICAL HAH + % ARABIC MATHEMATICAL TAH + % ARABIC MATHEMATICAL YEH + % ARABIC MATHEMATICAL KAF + % ARABIC 
MATHEMATICAL LAM + % ARABIC MATHEMATICAL MEEM + % ARABIC MATHEMATICAL NOON + % ARABIC MATHEMATICAL SEEN + % ARABIC MATHEMATICAL AIN + % ARABIC MATHEMATICAL FEH + % ARABIC MATHEMATICAL SAD + % ARABIC MATHEMATICAL QAF + % ARABIC MATHEMATICAL REH + % ARABIC MATHEMATICAL SHEEN + % ARABIC MATHEMATICAL TEH + % ARABIC MATHEMATICAL THEH + % ARABIC MATHEMATICAL KHAH + % ARABIC MATHEMATICAL THAL + % ARABIC MATHEMATICAL DAD + % ARABIC MATHEMATICAL ZAH + % ARABIC MATHEMATICAL GHAIN + % ARABIC MATHEMATICAL DOTLESS BEH + % ARABIC MATHEMATICAL DOTLESS NOON + % ARABIC MATHEMATICAL DOTLESS FEH + % ARABIC MATHEMATICAL DOTLESS QAF + % ARABIC MATHEMATICAL INITIAL BEH + % ARABIC MATHEMATICAL INITIAL JEEM + % ARABIC MATHEMATICAL INITIAL HEH + % ARABIC MATHEMATICAL INITIAL HAH + % ARABIC MATHEMATICAL INITIAL YEH + % ARABIC MATHEMATICAL INITIAL KAF + % ARABIC MATHEMATICAL INITIAL LAM + % ARABIC MATHEMATICAL INITIAL MEEM + % ARABIC MATHEMATICAL INITIAL NOON + % ARABIC MATHEMATICAL INITIAL SEEN + % ARABIC MATHEMATICAL INITIAL AIN + % ARABIC MATHEMATICAL INITIAL FEH + % ARABIC MATHEMATICAL INITIAL SAD + % ARABIC MATHEMATICAL INITIAL QAF + % ARABIC MATHEMATICAL INITIAL SHEEN + % ARABIC MATHEMATICAL INITIAL TEH + % ARABIC MATHEMATICAL INITIAL THEH + % ARABIC MATHEMATICAL INITIAL KHAH + % ARABIC MATHEMATICAL INITIAL DAD + % ARABIC MATHEMATICAL INITIAL GHAIN + % ARABIC MATHEMATICAL TAILED JEEM + % ARABIC MATHEMATICAL TAILED HAH + % ARABIC MATHEMATICAL TAILED YEH + % ARABIC MATHEMATICAL TAILED LAM + % ARABIC MATHEMATICAL TAILED NOON + % ARABIC MATHEMATICAL TAILED SEEN + % ARABIC MATHEMATICAL TAILED AIN + % ARABIC MATHEMATICAL TAILED SAD + % ARABIC MATHEMATICAL TAILED QAF + % ARABIC MATHEMATICAL TAILED SHEEN + % ARABIC MATHEMATICAL TAILED KHAH + % ARABIC MATHEMATICAL TAILED DAD + % ARABIC MATHEMATICAL TAILED GHAIN + % ARABIC MATHEMATICAL TAILED DOTLESS NOON + % ARABIC MATHEMATICAL TAILED DOTLESS QAF + % ARABIC MATHEMATICAL STRETCHED BEH + % ARABIC MATHEMATICAL STRETCHED JEEM + % ARABIC 
MATHEMATICAL STRETCHED HEH + % ARABIC MATHEMATICAL STRETCHED HAH + % ARABIC MATHEMATICAL STRETCHED TAH + % ARABIC MATHEMATICAL STRETCHED YEH + % ARABIC MATHEMATICAL STRETCHED KAF + % ARABIC MATHEMATICAL STRETCHED MEEM + % ARABIC MATHEMATICAL STRETCHED NOON + % ARABIC MATHEMATICAL STRETCHED SEEN + % ARABIC MATHEMATICAL STRETCHED AIN + % ARABIC MATHEMATICAL STRETCHED FEH + % ARABIC MATHEMATICAL STRETCHED SAD + % ARABIC MATHEMATICAL STRETCHED QAF + % ARABIC MATHEMATICAL STRETCHED SHEEN + % ARABIC MATHEMATICAL STRETCHED TEH + % ARABIC MATHEMATICAL STRETCHED THEH + % ARABIC MATHEMATICAL STRETCHED KHAH + % ARABIC MATHEMATICAL STRETCHED DAD + % ARABIC MATHEMATICAL STRETCHED ZAH + % ARABIC MATHEMATICAL STRETCHED GHAIN + % ARABIC MATHEMATICAL STRETCHED DOTLESS BEH + % ARABIC MATHEMATICAL STRETCHED DOTLESS FEH + % ARABIC MATHEMATICAL LOOPED ALEF + % ARABIC MATHEMATICAL LOOPED BEH + % ARABIC MATHEMATICAL LOOPED JEEM + % ARABIC MATHEMATICAL LOOPED DAL + % ARABIC MATHEMATICAL LOOPED HEH + % ARABIC MATHEMATICAL LOOPED WAW + % ARABIC MATHEMATICAL LOOPED ZAIN + % ARABIC MATHEMATICAL LOOPED HAH + % ARABIC MATHEMATICAL LOOPED TAH + % ARABIC MATHEMATICAL LOOPED YEH + % ARABIC MATHEMATICAL LOOPED LAM + % ARABIC MATHEMATICAL LOOPED MEEM + % ARABIC MATHEMATICAL LOOPED NOON + % ARABIC MATHEMATICAL LOOPED SEEN + % ARABIC MATHEMATICAL LOOPED AIN + % ARABIC MATHEMATICAL LOOPED FEH + % ARABIC MATHEMATICAL LOOPED SAD + % ARABIC MATHEMATICAL LOOPED QAF + % ARABIC MATHEMATICAL LOOPED REH + % ARABIC MATHEMATICAL LOOPED SHEEN + % ARABIC MATHEMATICAL LOOPED TEH + % ARABIC MATHEMATICAL LOOPED THEH + % ARABIC MATHEMATICAL LOOPED KHAH + % ARABIC MATHEMATICAL LOOPED THAL + % ARABIC MATHEMATICAL LOOPED DAD + % ARABIC MATHEMATICAL LOOPED ZAH + % ARABIC MATHEMATICAL LOOPED GHAIN + % ARABIC MATHEMATICAL DOUBLE-STRUCK BEH + % ARABIC MATHEMATICAL DOUBLE-STRUCK JEEM + % ARABIC MATHEMATICAL DOUBLE-STRUCK DAL + % ARABIC MATHEMATICAL DOUBLE-STRUCK WAW + % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAIN + % ARABIC 
MATHEMATICAL DOUBLE-STRUCK HAH + % ARABIC MATHEMATICAL DOUBLE-STRUCK TAH + % ARABIC MATHEMATICAL DOUBLE-STRUCK YEH + % ARABIC MATHEMATICAL DOUBLE-STRUCK LAM + % ARABIC MATHEMATICAL DOUBLE-STRUCK MEEM + % ARABIC MATHEMATICAL DOUBLE-STRUCK NOON + % ARABIC MATHEMATICAL DOUBLE-STRUCK SEEN + % ARABIC MATHEMATICAL DOUBLE-STRUCK AIN + % ARABIC MATHEMATICAL DOUBLE-STRUCK FEH + % ARABIC MATHEMATICAL DOUBLE-STRUCK SAD + % ARABIC MATHEMATICAL DOUBLE-STRUCK QAF + % ARABIC MATHEMATICAL DOUBLE-STRUCK REH + % ARABIC MATHEMATICAL DOUBLE-STRUCK SHEEN + % ARABIC MATHEMATICAL DOUBLE-STRUCK TEH + % ARABIC MATHEMATICAL DOUBLE-STRUCK THEH + % ARABIC MATHEMATICAL DOUBLE-STRUCK KHAH + % ARABIC MATHEMATICAL DOUBLE-STRUCK THAL + % ARABIC MATHEMATICAL DOUBLE-STRUCK DAD + % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAH + % ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN translit_end diff --git a/localedata/locales/translit_fraction b/localedata/locales/translit_fraction index 50dbd78..30f2843 100644 --- a/localedata/locales/translit_fraction +++ b/localedata/locales/translit_fraction @@ -2,10 +2,7 @@ escape_char / comment_char % % Transliterations of fractions. 
-% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*; \([^;]*\);.*$/ ""% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1> "";"" % VULGAR FRACTION THREE QUARTERS "";"" +% VULGAR FRACTION ONE SEVENTH + "";"" +% VULGAR FRACTION ONE NINTH + "";"" +% VULGAR FRACTION ONE TENTH + "";"" % VULGAR FRACTION ONE THIRD "";"" % VULGAR FRACTION TWO THIRDS @@ -44,7 +47,9 @@ translit_start % VULGAR FRACTION SEVEN EIGHTHS "";"" % FRACTION NUMERATOR ONE - "";"" + "";"" +% VULGAR FRACTION ZERO THIRDS + "";"" translit_end diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile index 166ee31..920bf0e 100644 --- a/localedata/unicode-gen/Makefile +++ b/localedata/unicode-gen/Makefile @@ -41,7 +41,7 @@ PYTHON3 = python3 WGET = wget DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt -GENERATED = i18n UTF-8 +GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction REPORTS = i18n-report UTF-8-report all: $(GENERATED) @@ -51,6 +51,12 @@ check: check-i18n check-UTF-8 install: cp -p i18n ../locales/i18n cp -p UTF-8 ../charmaps/UTF-8 + cp -p translit_combining ../locales/translit_combining + cp -p translit_compat ../locales/translit_compat + cp -p translit_circle ../locales/translit_circle + cp -p translit_cjk_compat ../locales/translit_cjk_compat + cp -p translit_font ../locales/translit_font + cp -p translit_fraction ../locales/translit_fraction clean: mostlyclean -rm -rf __pycache__ @@ -82,13 +88,43 @@ UTF-8: utf8_gen.py UTF-8-report: UTF-8 ../charmaps/UTF-8 UTF-8-report: utf8_compatibility.py - $(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \ - -n UTF-8 -a -m > $@ + $(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \ + -e EastAsianWidth.txt -o ../charmaps/UTF-8 \ + -n UTF-8 -a -m -c > $@ check-UTF-8: UTF-8-report @if grep '^Total.*: [^0]' UTF-8-report; \ 
then echo manual verification required; false; else true; fi +translit_combining: UnicodeData.txt +translit_combining: gen_translit_combining.py + $(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_compat: UnicodeData.txt +translit_compat: gen_translit_compat.py + $(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_circle: UnicodeData.txt +translit_circle: gen_translit_circle.py + $(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_cjk_compat: UnicodeData.txt +translit_cjk_compat: gen_translit_cjk_compat.py + $(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_font: UnicodeData.txt +translit_font: gen_translit_font.py + $(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_fraction: UnicodeData.txt +translit_fraction: gen_translit_fraction.py + $(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) .PHONY: downloads clean-downloads downloads: $(DOWNLOADS) diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py new file mode 100755 index 0000000..a146e7f --- /dev/null +++ b/localedata/unicode-gen/gen_translit_circle.py @@ -0,0 +1,149 @@ +#!/usr/bin/python3 +# +# Generate a translit_circle file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +''' +Generate a translit_circle file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_circle -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_circle file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of encircled characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_circle.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<circle>'): + decomposition = decomposition[9:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} "'.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"\n') + translit_file.write('\n') + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_circle file from UnicodeData.txt.
+ ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_circle + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_circle.new', + help='''The new translit_circle file, default: %(default)s. If the + original glibc/localedata/locales/translit_circle file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py new file mode 100755 index 0000000..a87d546 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_cjk_compat.py @@ -0,0 +1,219 @@ +#!/usr/bin/python3 +# +# Generate a translit_cjk_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version.
+# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +''' +Generate a translit_cjk_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_cjk_compat -h + … prints usage message … +''' + +import argparse +import time +import sys +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_cjk_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of CJK compatibility ') + translit_file.write('characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_cjk_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_cjk_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_cjk_compat close to the spirit of the original file, + therefore I added these special decomposition rules here.
+ ''' + special_decompose_dict = { + (0x2215,): [0x002F], # ∕ → / + (0x00B2,): [0x005E, 0x0032], # ² → ^2 + (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN) + (0x2113,): [0x006C], # ℓ → l + (0x00B3,): [0x005E, 0x0033], # ³ → ^3 + (0x00B5,): [0x0075], # µ → u + (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl + (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [ + 0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2], + (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2], + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<square>'): + decomposition = decomposition[9:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if
len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'): + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if len(decomposed_code_points) != 1: + sys.stderr.write( + 'Unexpected decomposition length {:x} {:s} {:s}\n'.format( + code_point, name, decomposition)) + exit(1) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_cjk_compat file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_cjk_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_cjk_compat.new', + help='''The new translit_cjk_compat file, default: %(default)s. 
If the + original glibc/localedata/locales/translit_cjk_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py new file mode 100755 index 0000000..07ed739 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_combining.py @@ -0,0 +1,441 @@ +#!/usr/bin/python3 +# +# Generate a translit_combining file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . 
+ +''' +Generate a translit_combining file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_combining -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_combining file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations that remove all ') + translit_file.write('combining characters (accents,\n') + translit_file.write('% pronounciation marks, etc.).\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_combining.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def is_combining_remove(code_point): + '''Check whether this is 
a combining character which should be listed + in the section of the translit_combining file where combining + characters are replaced by empty strings. + + We ignore combining characters from many scripts here because + the original translit_combining file didn’t do this for the + combining characters from these scripts either and I am not + sure yet whether this would be useful to do for all combining + characters or not. For the moment I think it is better to keep + close to the spirit of the original file. + ''' + if not unicode_utils.is_combining(code_point): + return False + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('DEVANAGARI', + 'BENGALI', + 'CYRILLIC', + 'SYRIAC', + 'THAANA', + 'NKO', + 'GURMUKHI', + 'TAMIL', + 'GUJARATI', + 'ORIYA', + 'TELUGU', + 'KANNADA', + 'MALAYALAM', + 'SINHALA', + 'THAI', + 'LAO', + 'TIBETAN', + 'MYANMAR', + 'ETHIOPIC', + 'TAGALOG', + 'HANUNOO', + 'BUHID', + 'TAGBANWA', + 'KHMER', + 'MONGOLIAN', + 'LIMBU', + 'NEW TAI LUE', + 'BUGINESE', + 'BALINESE', + 'SUNDANESE', + 'LEPCHA', + 'IDEOGRAPHIC', + 'HANGUL', + 'SYLOTI', + 'SAURASHTRA', + 'KAYAH', + 'REJANG', + 'CHAM', + 'VARIATION SELECTOR', + 'KHAROSHTHI', + 'MUSICAL SYMBOL', + 'SAMARITAN', + 'MANDAIC', + 'TAI THAM', + 'BATAK', + 'VEDIC', + 'COPTIC', + 'TIFINAGH', + 'BAMUM', + 'JAVANESE', + 'TAI VIET', + 'MEETEI', + 'MANICHAEAN', + 'BRAHMI', + 'KAITHI', + 'CHAKMA', + 'MAHAJANI', + 'SHARADA', + 'KHOJKI', + 'KHUDAWADI', + 'GRANTHA', + 'TIRHUTA', + 'SIDDHAM', + 'MODI VOWEL', + 'MODI SIGN', + 'TAKRI', + 'BASSA VAH', + 'PAHAWH HMONG', + 'MIAO', + 'DUPLOYAN', + 'MENDE KIKAKUI' + ): + if substring in name: + return False + return True + +def canonical_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + In some instances a canonical mapping or a compatibility mapping + may consist of a single character. 
For a canonical mapping, this + indicates that the character is a canonical equivalent of another + single character. For a compatibility mapping, this indicates that + the character is a compatibility equivalent of another single + character. + + A canonical mapping may also consist of a pair of characters, but + is never longer than two characters. When a canonical mapping + consists of a pair of characters, the first character may itself + be a character with a decomposition mapping, but the second + character never has a decomposition mapping. + + We ignore the canonical decomposition for code points + matching certain substrings because the original translit_combining + file didn’t include these types of characters either. I am unsure + about the usefulness of including them and want to keep close + to the spirit of the original file for the moment. + ''' + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('MUSICAL SYMBOL', + 'CJK COMPATIBILITY IDEOGRAPH', + 'BALINESE', + 'KAITHI LETTER', + 'CHAKMA VOWEL', + 'GRANTHA VOWEL', + 'TIRHUTA VOWEL', + 'SIDDHAM VOWEL'): + if substring in name: + return [] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and not decomposition.startswith('<'): + decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')] + if decomposed_code_points: + cd0 = canonical_decompose(decomposed_code_points[0]) + if cd0: + decomposed_code_points = cd0 + decomposed_code_points[1:] + return decomposed_code_points + else: + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not canonical or which are not in + UnicodeData.txt at all but some of these were used in the original + translit_combining file in glibc and they seemed to make sense. + I want to keep the update of translit_combining close to the + spirit of the original file, therefore I added these special + decomposition rules here. 
+ ''' + special_decompose_dict = { + # Ø U+00D8 is already handled in translit_neutral. But + # translit_combining is usually included after translit_neutral + # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to Ø U+00D8 and we want to + # further decompose this to U+004F. + (0x00D8,): [0x004F], # Ø → O + # ø U+00F8 is already handled in translit_neutral. But + # translit_combining is usually included after translit_neutral + # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to ø U+00F8 and we want to + # further decompose this to U+006F. + (0x00F8,): [0x006F], # ø → o + # æ U+00E6 is already in translit_compat because ligatures + # are handled in translit_compat. But ǣ U+01E3 has a + # canonical decomposition to U+00E6, U+0304 and we want to + # further decompose this to “ae”. + (0x00E6,): [0x0061, 0x0065], # æ → ae + # Æ U+00C6 is already in translit_compat because ligatures + # are handled in translit_compat. But Ǣ U+01E2 has a + # canonical decomposition to U+00C6, U+0304 and we want to + # further decompose this to “AE” + (0x00C6,): [0x0041, 0x0045], # Æ → AE + # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in + # translit_compat because ligatures are handled in translit_compat. + # But U+FB1F has a canonical decomposition to U+05F2 and + # we want to further decompose this to U+05D9, U+05D9. + (0x05F2,): [0x05D9, 0x05D9], # ײ → יי + # 0x2002 has a decomposition to 0x0020 in UnicodeData.txt + # But U+2000 EN QUAD has a canonical decomposition U+2002 + # and we want to further decompose this to U+0020. + (0x2002,): [0x0020], # EN SPACE → SPACE + # 0x2003 has a decomposition to 0x0020 in UnicodeData.txt + # But U+2001 EM QUAD has a canonical decomposition to U+2003 + # and we want to further decompose this to U+0020. + (0x2003,): [0x0020], # EM SPACE → SPACE + # U+2260 ≠ has the canonical decomposition U+003D U+0338 + # (= followed by ̸). 
After stripping the combining characters, + # the result is only = which reverses the meaning. + # Therefore, we add a special rules here for such mathematical + # negations: + (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<-> + (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<= + (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=> + (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=> + (0x2204,): [0x0021, 0x2203], # ∄ → !∃ + (0x2209,): [0x0021, 0x2208], # ∉ → !∈ + (0x220C,): [0x0021, 0x220B], # ∌ → !∋ + (0x2224,): [0x0021, 0x2223], # ∤ → !∣ + (0x2226,): [0x0021, 0x2225], # ∦ → !∥ + (0x2241,): [0x0021, 0x007E], # ≁ → !~ + (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~- + (0x2247,): [0x0021, 0x007E, 0x003D ], # ≇ → !~= + (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~ + (0x2260,): [0x0021, 0x003D], # ≠ → != + (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !== + (0x226D,): [0x0021, 0x224D], # ≭ → !≍ + (0x226E,): [0x0021, 0x003C], # ≮ → !< + (0x226F,): [0x0021, 0x003E], # ≯ → !> + (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<= + (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>= + (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~ + (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~ + (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<> + (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !>< + (0x2280,): [0x0021, 0x227A], # ⊀ → !≺ + (0x2281,): [0x0021, 0x227B], # ⊁ → !≻ + (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂ + (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃ + (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂= + (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃= + (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢ + (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨ + (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩ + (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫ + (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼ + (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽ + (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑ + (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒ + (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲ + (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳ + (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴ + (0x22ED,): [0x0021, 
0x22B5], # ⋭ → !⊵ + (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝ + # Special rule for 〈 U+3008 is added + # because 〈 U+2329 has the canonical decomposition U+3008 + # and we want to further decompose this to < U+003C. + (0x3008,): [0x003C], # 〈 → < + # Special rule for 〉 U+3009 is added + # because 〉 U+232A has the canonical decomposition U+3009 + # and we want to further decompose this to > U+003E. + (0x3009,): [0x003E], # 〉→ > + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_combining_remove(translit_file): + '''Write the section of the translit_combining file where combining + characters are replaced by empty strings. + ''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + if is_combining_remove(code_point): + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} ""\n'.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('\n') + +def output_decompositions(translit_file): + '''Write the section of the translit_combining file where characters + are decomposed and combining characters stripped from + the decompositions. 
+ ''' + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + if special_decompose([code_point]) != [code_point]: + decomposed_code_points = [special_decompose([code_point])] + else: + decomposed_code_points = [canonical_decompose(code_point)] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + for index in range(0, len(decomposed_code_points)): + decomposed_code_points[index] = [ + x for x in decomposed_code_points[index] + if not is_combining_remove(x)] + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format( + unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'])) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + output_combining_remove(translit_file) + output_decompositions(translit_file) + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_combining file from 
UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_combining + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_combining.new', + help='''The new translit_combining file, default: %(default)s. If the + original glibc/localedata/locales/translit_combining file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py new file mode 100755 index 0000000..d99e56d --- /dev/null +++ b/localedata/unicode-gen/gen_translit_compat.py @@ -0,0 +1,325 @@ +#!/usr/bin/python3 +# +# Generate a translit_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +''' +Generate a translit_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_compat -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of compatibility characters ') + translit_file.write('and ligatures.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def compatibility_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + “The compatibility decomposition is formed by recursively applying + the canonical and compatibility mappings, then applying the + Canonical Ordering Algorithm.” + + We don’t do the canonical decomposition here because this is + done in gen_translit_combining.py to generate translit_combining. + + And we ignore some of the possible compatibility formatting tags + here. Some of them are used in other translit_* files, not + translit_compat: + + <font>: translit_font + <circle>: translit_circle + <wide>: translit_wide + <narrow>: translit_narrow + <square>: translit_cjk_compat + <fraction>: translit_fraction + + And we ignore + + <noBreak>, <initial>, <medial>, <final>, <isolated> + + because they seem to be not useful for transliteration. 
+ ''' + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + compatibility_tags = ( + '', '', '', '') + for compatibility_tag in compatibility_tags: + if decomposition.startswith(compatibility_tag): + decomposition = decomposition[len(compatibility_tag)+1:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if (len(decomposed_code_points) > 1 + and decomposed_code_points[0] == 0x0020 + and decomposed_code_points[1] >= 0x0300 + and decomposed_code_points[1] <= 0x03FF): + # Decomposes into a space followed by a combining character. + # This is not useful for transliteration. + return [] + else: + return_value = [] + for index in range(0, len(decomposed_code_points)): + cd_code_points = compatibility_decompose( + decomposed_code_points[index]) + if cd_code_points: + return_value += cd_code_points + else: + return_value += [decomposed_code_points[index]] + return return_value + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added these special decomposition rules here. + ''' + special_decompose_dict = { + (0x03BC,): [0x0075], # μ → u + (0x02BC,): [0x0027], # ʼ → ' + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def special_ligature_decompose(code_point): + ''' + Decompositions for ligatures which are not in UnicodeData.txt at + all but which were used in the original translit_compat file in + glibc and which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added these special ligature decomposition rules here. 
+ + ''' + special_ligature_decompose_dict = { + 0x00E6: [0x0061, 0x0065], # æ → ae + 0x00C6: [0x0041, 0x0045], # Æ → AE + # These following 5 special ligature decompositions were + # in the original glibc/localedata/locales/translit_compat file + 0x0152: [0x004F, 0x0045], # Œ → OE + 0x0153: [0x006F, 0x0065], # œ → oe + 0x05F0: [0x05D5, 0x05D5], # װ → וו + 0x05F1: [0x05D5, 0x05D9], # ױ → וי + 0x05F2: [0x05D9, 0x05D9], # ײ → יי + # The following special ligature decompositions were + # not in the original glibc/localedata/locales/translit_compat file + # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE + # → U+041D CYRILLIC CAPITAL LETTER EN, + # U+0413 CYRILLIC CAPITAL LETTER GHE + 0x04A4: [0x041D, 0x0413], # Ҥ → НГ + # U+04A5 CYRILLIC SMALL LIGATURE EN GHE + # → U+043D CYRILLIC SMALL LETTER EN, + # U+0433 CYRILLIC SMALL LETTER GHE + 0x04A5: [0x043D, 0x0433], # ҥ → нг + # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE + # → U+0422 CYRILLIC CAPITAL LETTER TE, + # U+0426 CYRILLIC CAPITAL LETTER TSE + 0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ + # U+04B5 CYRILLIC SMALL LIGATURE TE TSE + # → U+0442 CYRILLIC SMALL LETTER TE, + # U+0446 CYRILLIC SMALL LETTER TSE + 0x04B5: [0x0442, 0x0446], # ҵ → тц + # U+04d4 CYRILLIC CAPITAL LIGATURE A IE + # → U+0410 CYRILLIC CAPITAL LETTER A + # U+0415;CYRILLIC CAPITAL LETTER IE + 0x04D4: [0x0410, 0x0415], # Ӕ → АЕ + # U+04D5 CYRILLIC SMALL LIGATURE A IE + # → U+0430 CYRILLIC SMALL LETTER A, + # U+0435 CYRILLIC SMALL LETTER IE + 0x04D5: [0x0430, 0x0435], # ӕ → ае + # I am not sure what to do with the following ligatures + # maybe it makes no sense to decompose them: + # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH + # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA + # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA + # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM + # U+fe20 COMBINING LIGATURE LEFT HALF + # U+fe21 COMBINING LIGATURE RIGHT HALF + # U+fe27 COMBINING LIGATURE LEFT HALF BELOW + # U+fe28 
COMBINING LIGATURE RIGHT HALF BELOW + # U+11176 MAHAJANI LIGATURE SHRI + # U+1f670 SCRIPT LIGATURE ET ORNAMENT + # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT + # U+1f672 LIGATURE OPEN ET ORNAMENT + # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT + } + if code_point in special_ligature_decompose_dict: + return special_ligature_decompose_dict[code_point] + else: + return [code_point] + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposed_code_points = [compatibility_decompose(code_point)] + if not decomposed_code_points[0]: + if special_decompose([code_point]) != [code_point]: + decomposed_code_points[0] = special_decompose([code_point]) + else: + special_decomposed_code_points = [] + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + elif 'LIGATURE' in name and 'ARABIC' not 
in name: + decomposed_code_points = special_ligature_decompose(code_point) + if decomposed_code_points[0] != code_point: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('"') + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + else: + print('Warning: unhandled ligature: {:x} {:s}'.format( + code_point, name)) + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_compat file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_compat.new', + help='''The new translit_compat file, default: %(default)s. If the + original glibc/localedata/locales/translit_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py new file mode 100755 index 0000000..c7ec509 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_font.py @@ -0,0 +1,155 @@ +#!/usr/bin/python3 +# +# Generate a translit_font file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +''' +Generate a translit_font file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_font -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_font file to get the + original head and tail. 
+ + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of font equivalents.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_font.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith(''): + decomposition = decomposition[7:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + translit_file.write('{:s} '.format( 
+ unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write(' % {:s}\n'.format(name)) + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_font file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_font + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_font.new', + help='''The new translit_font file, default: %(default)s. If the + original glibc/localedata/locales/translit_font file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py new file mode 100755 index 0000000..bf460f2 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_fraction.py @@ -0,0 +1,196 @@ +#!/usr/bin/python3 +# +# Generate a translit_fraction file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +''' +Generate a translit_fraction file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_fraction -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_fraction file to get the + original head and tail. 
+ + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of fractions.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_fraction.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('% The replacements have been surrounded ') + translit_file.write('with spaces, because fractions are\n') + translit_file.write('% often preceded by a decimal number and ') + translit_file.write('followed by a unit or a math symbol.\n') + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_fraction file in glibc and + which seem to make sense. 
I want to keep the update of + translit_fraction close to the spirit of the original file, + therefore I added these special decomposition rules here. + ''' + special_decompose_dict = { + (0x2044,): [0x002F], # ⁄ → / + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<fraction>'): + decomposition = decomposition[11:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + decomposed_code_points[0] = [0x0020] \ + + decomposed_code_points[0] \ + + [0x0020] + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + 
unicode_utils.ucs_symbol(decomposed_code_point))) + if 
len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_fraction file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_fraction + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_fraction.new', + help='''The new translit_fraction file, default: %(default)s. If the + original glibc/localedata/locales/translit_fraction file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py index 0c74f2a..0f064f5 100755 --- a/localedata/unicode-gen/gen_unicode_ctype.py +++ b/localedata/unicode-gen/gen_unicode_ctype.py @@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option: ''' import argparse -import sys import time import re - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the DerivedCoreProperties.txt file -# -# Contents of this dictionary look like this: -# -# {917504: ['Default_Ignorable_Code_Point'], -# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], -# … -# } -DERIVED_CORE_PROPERTIES = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. 
- - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;;Lo;0;L;;;;;N;;;;; - 9FCC;;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. 
- fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_derived_core_properties(filename): - '''Stores the entire contents of the DerivedCoreProperties.txt file - in the DERIVED_CORE_PROPERTIES dictionary. - - Lines in DerivedCoreProperties.txt are either a code point range like - this: - - 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z - - or a single code point like this: - - 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR - - ''' - with open(filename, mode='r') as derived_core_properties_file: - for line in derived_core_properties_file: - match = re.match( - r'^(?P[0-9A-F]{4,6})' - + r'(?:\.\.(?P[0-9A-F]{4,6}))?' 
- + r'\s*;\s*(?P[a-zA-Z_]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - prop = match.group('property') - if code_point in DERIVED_CORE_PROPERTIES: - DERIVED_CORE_PROPERTIES[code_point].append(prop) - else: - DERIVED_CORE_PROPERTIES[code_point] = [prop] - -def to_upper(code_point): - '''Returns the code point of the uppercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['upper']): - return UNICODE_ATTRIBUTES[code_point]['upper'] - else: - return code_point - -def to_lower(code_point): - '''Returns the code point of the lowercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['lower']): - return UNICODE_ATTRIBUTES[code_point]['lower'] - else: - return code_point - -def to_title(code_point): - '''Returns the code point of the titlecase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['title']): - return UNICODE_ATTRIBUTES[code_point]['title'] - else: - return code_point - -def is_upper(code_point): - '''Checks whether the character with this code point is uppercase''' - return (to_lower(code_point) != code_point - or (code_point in DERIVED_CORE_PROPERTIES - and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_lower(code_point): - '''Checks whether the character with this code point is lowercase''' - # Some characters are defined as “Lowercase” in - # DerivedCoreProperties.txt but do not have a mapping to upper - # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is - # one of these. - return (to_upper(code_point) != code_point - # is lowercase, but without simple to_upper mapping. 
- or code_point == 0x00DF - or (code_point in DERIVED_CORE_PROPERTIES - and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_alpha(code_point): - '''Checks whether the character with this code point is alphabetic''' - return ((code_point in DERIVED_CORE_PROPERTIES - and - 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) - or - # Consider all the non-ASCII digits as alphabetic. - # ISO C 99 forbids us to have them in category “digit”, - # but we want iswalnum to return true on them. - (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' - and not (code_point >= 0x0030 and code_point <= 0x0039))) - -def is_digit(code_point): - '''Checks whether the character with this code point is a digit''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') - # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without - # a zero. Must add <0> in front of them by hand. - else: - # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.5: - # The iswdigit function tests for any wide character that - # corresponds to a decimal-digit character (as defined in 5.2.1). - # 5.2.1: - # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_outdigit(code_point): - '''Checks whether the character with this code point is outdigit''' - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_blank(code_point): - '''Checks whether the character with this code point is blank''' - return (code_point == 0x0009 # '\t' - # Category Zs without mention of '' - or (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' - and '' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])) - -def is_space(code_point): - '''Checks whether the character with this code point is a space''' - # Don’t make U+00A0 a space. 
Non-breaking space means that all programs - # should treat it like a punctuation character, not like a space. - return (code_point == 0x0020 # ' ' - or code_point == 0x000C # '\f' - or code_point == 0x000A # '\n' - or code_point == 0x000D # '\r' - or code_point == 0x0009 # '\t' - or code_point == 0x000B # '\v' - # Categories Zl, Zp, and Zs without mention of "" - or (UNICODE_ATTRIBUTES[code_point]['name'] - and - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] - or - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] - and - '' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])))) - -def is_cntrl(code_point): - '''Checks whether the character with this code point is - a control character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and (UNICODE_ATTRIBUTES[code_point]['name'] == '' - or - UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) - -def is_xdigit(code_point): - '''Checks whether the character with this code point is - a hexadecimal digit''' - if False: - return (is_digit(code_point) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - else: - # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.12: - # The iswxdigit function tests for any wide character that - # corresponds to a hexadecimal-digit character (as defined - # in 6.4.4.1). 
- # 6.4.4.1: - # hexadecimal-digit: one of - # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - return ((code_point >= 0x0030 and code_point <= 0x0039) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - -def is_graph(code_point): - '''Checks whether the character with this code point is - a graphical character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '' - and not is_space(code_point)) - -def is_print(code_point): - '''Checks whether the character with this code point is printable''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '' - and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) - -def is_punct(code_point): - '''Checks whether the character with this code point is punctuation''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) - else: - # The traditional POSIX definition of punctuation is every graphic, - # non-alphanumeric character. - return (is_graph(code_point) - and not is_alpha(code_point) - and not is_digit(code_point)) - -def is_combining(code_point): - '''Checks whether the character with this code point is - a combining character''' - # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt - # file. In 3.0.1 it was identical to the union of the general categories - # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the - # PropList.txt file, so we take the latter definition. 
- return (UNICODE_ATTRIBUTES[code_point]['name'] - and - UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) - -def is_combining_level3(code_point): - '''Checks whether the character with this code point is - a combining level3 character''' - return (is_combining(code_point) - and - int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return ''.format(code_point) - else: - return ''.format(code_point) - -def ucs_symbol_range(code_point_low, code_point_high): - '''Returns a string UCS symbol string for a code point range. - - Example: - - .. - ''' - return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) +import unicode_utils def code_point_ranges(is_class_function): '''Returns a list of ranges of code points for which is_class_function @@ -379,7 +43,7 @@ def code_point_ranges(is_class_function): [[65, 90], [192, 214], [216, 222], [256], … ] ''' cp_ranges = [] - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): if is_class_function(code_point): if (cp_ranges and cp_ranges[-1][-1] == code_point - 1): @@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function): if line.strip(): line += ';' if len(code_point_range) == 1: - range_string = ucs_symbol(code_point_range[0]) + range_string = unicode_utils.ucs_symbol(code_point_range[0]) else: - range_string = ucs_symbol_range( + range_string = unicode_utils.ucs_symbol_range( code_point_range[0], code_point_range[-1]) if len(line+range_string) > max_column: i18n_file.write(line+'/\n') @@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function): line = prefix map_string = '' i18n_file.write('%s /\n' %map_name) - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): mapped = map_function(code_point) if code_point != 
mapped: if line.strip(): line += ';' map_string = '(' \ - + ucs_symbol(code_point) \ + + unicode_utils.ucs_symbol(code_point) \ + ',' \ - + ucs_symbol(mapped) \ + + unicode_utils.ucs_symbol(mapped) \ + ')' if len(line+map_string) > max_column: i18n_file.write(line+'/\n') @@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function): i18n_file.write(line+'\n') i18n_file.write('\n') -def verifications(): - '''Tests whether the is_* functions observe the known restrictions''' - for code_point in sorted(UNICODE_ATTRIBUTES): - # toupper restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_upper(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_upper(code_point)}) - # tolower restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_lower(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_lower(code_point)}) - # alpha restriction: "Characters classified as either upper or lower - # shall automatically belong to this class. 
- if ((is_lower(code_point) or is_upper(code_point)) - and not is_alpha(code_point)): - sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ - 'sym': ucs_symbol(code_point)}) - # alpha restriction: “No character specified for the keywords cntrl, - # digit, punct or space shall be specified.” - if (is_alpha(code_point) and is_cntrl(code_point)): - sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is alpha and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is alpha and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_space(code_point)): - sys.stderr.write('%(sym)s is alpha and space\n' %{ - 'sym': ucs_symbol(code_point)}) - # space restriction: “No character specified for the keywords upper, - # lower, alpha, digit, graph or xdigit shall be specified.” - # upper, lower, alpha already checked above. - if (is_space(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is space and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is space and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is space and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # cntrl restriction: “No character specified for the keywords upper, - # lower, alpha, digit, punct, graph, print or xdigit shall be - # specified.” upper, lower, alpha already checked above. 
- if (is_cntrl(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is cntrl and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is cntrl and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is cntrl and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_print(code_point)): - sys.stderr.write('%(sym)s is cntrl and print\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # punct restriction: “No character specified for the keywords upper, - # lower, alpha, digit, cntrl, xdigit or as the character shall - # be specified.” upper, lower, alpha, cntrl already checked above. - if (is_punct(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is punct and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is punct and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and code_point == 0x0020): - sys.stderr.write('%(sym)s is punct\n' %{ - 'sym': ucs_symbol(code_point)}) - # graph restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # print restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # graph - print relation: differ only in the character. - # How is this possible if there are more than one space character?! - # I think susv2/xbd/locale.html should speak of “space characters”, - # not “space character”. 
- if (is_print(code_point) - and not (is_graph(code_point) or is_space(code_point))): - sys.stderr.write('%(sym)s is print but not graph|\n' %{ - 'sym': ucs_symbol(code_point)}) - if (not is_print(code_point) - and (is_graph(code_point) or code_point == 0x0020)): - sys.stderr.write('%(sym)s is graph| but not print\n' %{ - 'sym': ucs_symbol(code_point)}) - def read_input_file(filename): '''Reads the original glibc i18n file to get the original head and tail. @@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version): + 'program.\n\n') i18n_file.write('% The "upper" class reflects the uppercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'upper', is_upper) + output_charclass(i18n_file, 'upper', unicode_utils.is_upper) i18n_file.write('% The "lower" class reflects the lowercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'lower', is_lower) + output_charclass(i18n_file, 'lower', unicode_utils.is_lower) i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is ' + 'reflecting\n') i18n_file.write('% the recommendations in TR 10176 annex A\n') - output_charclass(i18n_file, 'alpha', is_alpha) + output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha) i18n_file.write('% The "digit" class must only contain the ' + 'BASIC LATIN digits, says ISO C 99\n') i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n') - output_charclass(i18n_file, 'digit', is_digit) + output_charclass(i18n_file, 'digit', unicode_utils.is_digit) i18n_file.write('% The "outdigit" information is by default ' + '"0" to "9". 
We don\'t have to\n') i18n_file.write('% provide it here since localedef will fill ' @@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version): i18n_file.write('% outdigit /\n') i18n_file.write('% ..\n\n') # output_charclass(i18n_file, 'outdigit', is_outdigit) - output_charclass(i18n_file, 'space', is_space) - output_charclass(i18n_file, 'cntrl', is_cntrl) - output_charclass(i18n_file, 'punct', is_punct) - output_charclass(i18n_file, 'graph', is_graph) - output_charclass(i18n_file, 'print', is_print) + output_charclass(i18n_file, 'space', unicode_utils.is_space) + output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl) + output_charclass(i18n_file, 'punct', unicode_utils.is_punct) + output_charclass(i18n_file, 'graph', unicode_utils.is_graph) + output_charclass(i18n_file, 'print', unicode_utils.is_print) i18n_file.write('% The "xdigit" class must only contain the ' + 'BASIC LATIN digits and A-F, a-f,\n') i18n_file.write('% says ISO C 99 ' + '(sections 7.25.2.1.12 and 6.4.4.1).\n') - output_charclass(i18n_file, 'xdigit', is_xdigit) - output_charclass(i18n_file, 'blank', is_blank) - output_charmap(i18n_file, 'toupper', to_upper) - output_charmap(i18n_file, 'tolower', to_lower) - output_charmap(i18n_file, 'map "totitle";', to_title) + output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit) + output_charclass(i18n_file, 'blank', unicode_utils.is_blank) + output_charmap(i18n_file, 'toupper', unicode_utils.to_upper) + output_charmap(i18n_file, 'tolower', unicode_utils.to_lower) + output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title) i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 ' + 'annex B.1\n') i18n_file.write('% That is, all combining characters (level 2+3).\n') - output_charclass(i18n_file, 'class "combining";', is_combining) + output_charclass(i18n_file, 'class "combining";', + unicode_utils.is_combining) i18n_file.write('% The "combining_level3" class reflects ' + 'ISO/IEC 10646-1 annex B.2\n') 
i18n_file.write('% That is, combining characters of level 3.\n') - output_charclass(i18n_file, - 'class "combining_level3";', is_combining_level3) + output_charclass(i18n_file, 'class "combining_level3";', + unicode_utils.is_combining_level3) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -739,9 +300,11 @@ if __name__ == "__main__": help='The Unicode version of the input files used.') ARGS = PARSER.parse_args() - fill_attributes(ARGS.unicode_data_file) - fill_derived_core_properties(ARGS.derived_core_properties_file) - verifications() + unicode_utils.fill_attributes( + ARGS.unicode_data_file) + unicode_utils.fill_derived_core_properties( + ARGS.derived_core_properties_file) + unicode_utils.verifications() HEAD = TAIL = '' if ARGS.input_file: (HEAD, TAIL) = read_input_file(ARGS.input_file) diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py new file mode 100644 index 0000000..ee91582 --- /dev/null +++ b/localedata/unicode-gen/unicode_utils.py @@ -0,0 +1,502 @@ +# Utilities to generate Unicode data for glibc from upstream Unicode data. +# +# Copyright (C) 2014, 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . 
+ +''' +This module contains utilities used by the scripts to generate +Unicode data for glibc from upstream Unicode data files. +''' + +import sys +import re + +# Dictionary holding the entire contents of the UnicodeData.txt file +# +# Contents of this dictionary look like this: +# +# {0: {'category': 'Cc', +# 'title': None, +# 'digit': '', +# 'name': '<control>', +# 'bidi': 'BN', +# 'combining': '0', +# 'comment': '', +# 'oldname': 'NULL', +# 'decomposition': '', +# 'upper': None, +# 'mirrored': 'N', +# 'lower': None, +# 'decdigit': '', +# 'numeric': ''}, +# … +# } +UNICODE_ATTRIBUTES = {} + +# Dictionary holding the entire contents of the DerivedCoreProperties.txt file +# +# Contents of this dictionary look like this: +# +# {917504: ['Default_Ignorable_Code_Point'], +# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], +# … +# } +DERIVED_CORE_PROPERTIES = {} + +# Dictionary holding the entire contents of the EastAsianWidth.txt file +# +# Contents of this dictionary look like this: +# +# {0: 'N', … , 45430: 'W', …} +EAST_ASIAN_WIDTHS = {} + +def fill_attribute(code_point, fields): + '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. + + One entry in the UNICODE_ATTRIBUTES dictionary represents one line + in the UnicodeData.txt file. 
+ + ''' + UNICODE_ATTRIBUTES[code_point] = { + 'name': fields[1], # Character name + 'category': fields[2], # General category + 'combining': fields[3], # Canonical combining classes + 'bidi': fields[4], # Bidirectional category + 'decomposition': fields[5], # Character decomposition mapping + 'decdigit': fields[6], # Decimal digit value + 'digit': fields[7], # Digit value + 'numeric': fields[8], # Numeric value + 'mirrored': fields[9], # mirrored + 'oldname': fields[10], # Old Unicode 1.0 name + 'comment': fields[11], # comment + # Uppercase mapping + 'upper': int(fields[12], 16) if fields[12] else None, + # Lowercase mapping + 'lower': int(fields[13], 16) if fields[13] else None, + # Titlecase mapping + 'title': int(fields[14], 16) if fields[14] else None, + } + +def fill_attributes(filename): + '''Stores the entire contents of the UnicodeData.txt file + in the UNICODE_ATTRIBUTES dictionary. + + A typical line for a single code point in UnicodeData.txt looks + like this: + + 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; + + Code point ranges are indicated by pairs of lines like this: + + 4E00;;Lo;0;L;;;;;N;;;;; + 9FCC;;Lo;0;L;;;;;N;;;;; + ''' + with open(filename, mode='r') as unicode_data_file: + fields_start = [] + for line in unicode_data_file: + fields = line.strip().split(';') + if len(fields) != 15: + sys.stderr.write( + 'short line in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + if fields[2] == 'Cs': + # Surrogates are UTF-16 artefacts, + # not real characters. Ignore them. 
+ fields_start = [] + continue + if fields[1].endswith(', First>'): + fields_start = fields + fields_start[1] = fields_start[1].split(',')[0][1:] + continue + if fields[1].endswith(', Last>'): + fields[1] = fields[1].split(',')[0][1:] + if fields[1:] != fields_start[1:]: + sys.stderr.write( + 'broken code point range in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + for code_point in range( + int(fields_start[0], 16), + int(fields[0], 16)+1): + fill_attribute(code_point, fields) + fields_start = [] + continue + fill_attribute(int(fields[0], 16), fields) + fields_start = [] + +def fill_derived_core_properties(filename): + '''Stores the entire contents of the DerivedCoreProperties.txt file + in the DERIVED_CORE_PROPERTIES dictionary. + + Lines in DerivedCoreProperties.txt are either a code point range like + this: + + 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z + + or a single code point like this: + + 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR + + ''' + with open(filename, mode='r') as derived_core_properties_file: + for line in derived_core_properties_file: + match = re.match( + r'^(?P[0-9A-F]{4,6})' + + r'(?:\.\.(?P[0-9A-F]{4,6}))?' + + r'\s*;\s*(?P[a-zA-Z_]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + prop = match.group('property') + if code_point in DERIVED_CORE_PROPERTIES: + DERIVED_CORE_PROPERTIES[code_point].append(prop) + else: + DERIVED_CORE_PROPERTIES[code_point] = [prop] + +def fill_east_asian_widths(filename): + '''Stores the entire contents of the EastAsianWidths.txt file + in the EAST_ASIAN_WIDTHS dictionary. + + Lines in EastAsianWidths.txt are either a code point range like + this: + + 9FCD..9FFF;W # Cn [51] .. 
+ + or a single code point like this: + + A015;W # Lm YI SYLLABLE WU + ''' + with open(filename, mode='r') as east_asian_widths_file: + for line in east_asian_widths_file: + match = re.match( + r'^(?P[0-9A-F]{4,6})' + +r'(?:\.\.(?P[0-9A-F]{4,6}))?' + +r'\s*;\s*(?P[a-zA-Z]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + EAST_ASIAN_WIDTHS[code_point] = match.group('property') + +def to_upper(code_point): + '''Returns the code point of the uppercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['upper']): + return UNICODE_ATTRIBUTES[code_point]['upper'] + else: + return code_point + +def to_lower(code_point): + '''Returns the code point of the lowercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['lower']): + return UNICODE_ATTRIBUTES[code_point]['lower'] + else: + return code_point + +def to_title(code_point): + '''Returns the code point of the titlecase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['title']): + return UNICODE_ATTRIBUTES[code_point]['title'] + else: + return code_point + +def is_upper(code_point): + '''Checks whether the character with this code point is uppercase''' + return (to_lower(code_point) != code_point + or (code_point in DERIVED_CORE_PROPERTIES + and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_lower(code_point): + '''Checks whether the character with this code point is lowercase''' + # Some characters are defined as “Lowercase” in + # DerivedCoreProperties.txt but do not have a mapping to upper + # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is + # one of these. 
+ return (to_upper(code_point) != code_point + # is lowercase, but without simple to_upper mapping. + or code_point == 0x00DF + or (code_point in DERIVED_CORE_PROPERTIES + and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_alpha(code_point): + '''Checks whether the character with this code point is alphabetic''' + return ((code_point in DERIVED_CORE_PROPERTIES + and + 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) + or + # Consider all the non-ASCII digits as alphabetic. + # ISO C 99 forbids us to have them in category “digit”, + # but we want iswalnum to return true on them. + (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' + and not (code_point >= 0x0030 and code_point <= 0x0039))) + +def is_digit(code_point): + '''Checks whether the character with this code point is a digit''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') + # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without + # a zero. Must add <0> in front of them by hand. + else: + # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 + # takes it away: + # 7.25.2.1.5: + # The iswdigit function tests for any wide character that + # corresponds to a decimal-digit character (as defined in 5.2.1). 
+ # 5.2.1: + # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_outdigit(code_point): + '''Checks whether the character with this code point is outdigit''' + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_blank(code_point): + '''Checks whether the character with this code point is blank''' + return (code_point == 0x0009 # '\t' + # Category Zs without mention of '' + or (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' + and '' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])) + +def is_space(code_point): + '''Checks whether the character with this code point is a space''' + # Don’t make U+00A0 a space. Non-breaking space means that all programs + # should treat it like a punctuation character, not like a space. + return (code_point == 0x0020 # ' ' + or code_point == 0x000C # '\f' + or code_point == 0x000A # '\n' + or code_point == 0x000D # '\r' + or code_point == 0x0009 # '\t' + or code_point == 0x000B # '\v' + # Categories Zl, Zp, and Zs without mention of "" + or (UNICODE_ATTRIBUTES[code_point]['name'] + and + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] + or + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] + and + '' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])))) + +def is_cntrl(code_point): + '''Checks whether the character with this code point is + a control character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and (UNICODE_ATTRIBUTES[code_point]['name'] == '' + or + UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) + +def is_xdigit(code_point): + '''Checks whether the character with this code point is + a hexadecimal digit''' + if False: + return (is_digit(code_point) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + else: + # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 + # takes it away: + 
# 7.25.2.1.12: + # The iswxdigit function tests for any wide character that + # corresponds to a hexadecimal-digit character (as defined + # in 6.4.4.1). + # 6.4.4.1: + # hexadecimal-digit: one of + # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F + return ((code_point >= 0x0030 and code_point <= 0x0039) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + +def is_graph(code_point): + '''Checks whether the character with this code point is + a graphical character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '' + and not is_space(code_point)) + +def is_print(code_point): + '''Checks whether the character with this code point is printable''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '' + and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) + +def is_punct(code_point): + '''Checks whether the character with this code point is punctuation''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) + else: + # The traditional POSIX definition of punctuation is every graphic, + # non-alphanumeric character. + return (is_graph(code_point) + and not is_alpha(code_point) + and not is_digit(code_point)) + +def is_combining(code_point): + '''Checks whether the character with this code point is + a combining character''' + # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt + # file. In 3.0.1 it was identical to the union of the general categories + # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the + # PropList.txt file, so we take the latter definition. 
+ return (UNICODE_ATTRIBUTES[code_point]['name'] + and + UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) + +def is_combining_level3(code_point): + '''Checks whether the character with this code point is + a combining level3 character''' + return (is_combining(code_point) + and + int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) + +def ucs_symbol(code_point): + '''Return the UCS symbol string for a Unicode character.''' + if code_point < 0x10000: + return ''.format(code_point) + else: + return ''.format(code_point) + +def ucs_symbol_range(code_point_low, code_point_high): + '''Returns a string UCS symbol string for a code point range. + + Example: + + .. + ''' + return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) + +def verifications(): + '''Tests whether the is_* functions observe the known restrictions''' + for code_point in sorted(UNICODE_ATTRIBUTES): + # toupper restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_upper(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_upper(code_point)}) + # tolower restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_lower(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_lower(code_point)}) + # alpha restriction: "Characters classified as either upper or lower + # shall automatically belong to this class. 
+ if ((is_lower(code_point) or is_upper(code_point)) + and not is_alpha(code_point)): + sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ + 'sym': ucs_symbol(code_point)}) + # alpha restriction: “No character specified for the keywords cntrl, + # digit, punct or space shall be specified.” + if (is_alpha(code_point) and is_cntrl(code_point)): + sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is alpha and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is alpha and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_space(code_point)): + sys.stderr.write('%(sym)s is alpha and space\n' %{ + 'sym': ucs_symbol(code_point)}) + # space restriction: “No character specified for the keywords upper, + # lower, alpha, digit, graph or xdigit shall be specified.” + # upper, lower, alpha already checked above. + if (is_space(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is space and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is space and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is space and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # cntrl restriction: “No character specified for the keywords upper, + # lower, alpha, digit, punct, graph, print or xdigit shall be + # specified.” upper, lower, alpha already checked above. 
+ if (is_cntrl(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is cntrl and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is cntrl and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is cntrl and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_print(code_point)): + sys.stderr.write('%(sym)s is cntrl and print\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # punct restriction: “No character specified for the keywords upper, + # lower, alpha, digit, cntrl, xdigit or as the character shall + # be specified.” upper, lower, alpha, cntrl already checked above. + if (is_punct(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is punct and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is punct and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and code_point == 0x0020): + sys.stderr.write('%(sym)s is punct\n' %{ + 'sym': ucs_symbol(code_point)}) + # graph restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # print restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # graph - print relation: differ only in the character. + # How is this possible if there are more than one space character?! + # I think susv2/xbd/locale.html should speak of “space characters”, + # not “space character”. 
+ if (is_print(code_point) + and not (is_graph(code_point) or is_space(code_point))): + sys.stderr.write('%(sym)s is print but not graph|\n' %{ + 'sym': unicode_utils.ucs_symbol(code_point)}) + if (not is_print(code_point) + and (is_graph(code_point) or code_point == 0x0020)): + sys.stderr.write('%(sym)s is graph| but not print\n' %{ + 'sym': unicode_utils.ucs_symbol(code_point)}) diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py index b84a1eb..3b7a94c 100755 --- a/localedata/unicode-gen/utf8_compatibility.py +++ b/localedata/unicode-gen/utf8_compatibility.py @@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option: import sys import re import argparse - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the EastAsianWidths.txt file -# -# Contents of this dictionary look like this: -# -# {0: 'N', … , 45430: 'W', …} -EAST_ASIAN_WIDTHS = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. 
- - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;;Lo;0;L;;;;;N;;;;; - 9FCC;;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. 
- fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_east_asian_widths(filename): - '''Stores the entire contents of the EastAsianWidths.txt file - in the EAST_ASIAN_WIDTHS dictionary. - - Lines in EastAsianWidths.txt are either a code point range like - this: - - 9FCD..9FFF;W # Cn [51] .. - - or a single code point like this: - - A015;W # Lm YI SYLLABLE WU - ''' - with open(filename, mode='r') as east_asian_widths_file: - for line in east_asian_widths_file: - match = re.match( - r'^(?P[0-9A-F]{4,6})' - +r'(?:\.\.(?P[0-9A-F]{4,6}))?' 
- +r'\s*;\s*(?P[a-zA-Z]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - EAST_ASIAN_WIDTHS[code_point] = match.group('property') - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return ''.format(code_point) - else: - return ''.format(code_point) +import unicode_utils def create_charmap_dictionary(file_name): '''Create a dictionary for all code points found in the CHARMAP @@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_missing_characters: for key in sorted(set(ocharmap)-set(ncharmap)): print('removed: {:s} {:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), ocharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_charmap = {} for key in set(ocharmap).intersection(set(ncharmap)): @@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_changed_characters: for key in sorted(changed_charmap): print('changed: {:s} {:s}->{:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), changed_charmap[key][0], changed_charmap[key][1], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated CHARMAP: %d' %len(set(ncharmap)-set(ocharmap))) if ARGS.show_added_characters: for key in sorted(set(ncharmap)-set(ocharmap)): print('added: {:s} {:s} {:s}'.format( - ucs_symbol(key), + 
unicode_utils.ucs_symbol(key), ncharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) def create_width_dictionary(file_name): '''Create a dictionary for all code points found in the WIDTH @@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name): + 'i.e. these have width 1 now.)') if ARGS.show_missing_characters: for key in sorted(set(owidth)-set(nwidth)): - print('removed: {:s} '.format(ucs_symbol(key)) + print('removed: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d} : '.format(owidth[key]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_width = {} for key in set(owidth).intersection(set(nwidth)): @@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name): %len(changed_width)) if ARGS.show_changed_characters: for key in sorted(changed_width): - print('changed width: {:s} '.format(ucs_symbol(key)) + print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d}->{:d} : '.format(changed_width[key][0], changed_width[key][1]) + 'eaw={:s} '.format( - 
EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated WIDTH: %d' %len(set(nwidth)-set(owidth))) @@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name): + 'i.e. 
these had width 1 before.)') if ARGS.show_added_characters: for key in sorted(set(nwidth)-set(owidth)): - print('added: {:s} '.format(ucs_symbol(key)) + print('added: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d} : '.format(nwidth[key]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -392,8 +253,8 @@ if __name__ == "__main__": ARGS = PARSER.parse_args() if ARGS.unicode_data_file: - fill_attributes(ARGS.unicode_data_file) + unicode_utils.fill_attributes(ARGS.unicode_data_file) if ARGS.east_asian_width_file: - fill_east_asian_widths(ARGS.east_asian_width_file) + unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file) check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file) check_width(ARGS.old_utf8_file, ARGS.new_utf8_file) diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py index f1b88f5..bc84c07 100755 --- a/localedata/unicode-gen/utf8_gen.py +++ b/localedata/unicode-gen/utf8_gen.py @@ -29,6 +29,7 @@ It will output UTF-8 file import sys import re +import unicode_utils # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, # sections 3.11 and 4.4. 
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = ( 'P', 'H' ) -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return ''.format(code_point) - else: - return ''.format(code_point) - def process_range(start, end, outfile, name): '''Writes a range of code points into the CHARMAP section of the output file @@ -78,7 +72,7 @@ def process_range(start, end, outfile, name): + JAMO_MEDIAL_SHORT_NAME[index2] \ + JAMO_FINAL_SHORT_NAME[index3] outfile.write('{:<11s} {:<12s} {:s}\n'.format( - ucs_symbol(i), convert_to_hex(i), + unicode_utils.ucs_symbol(i), convert_to_hex(i), hangul_syllable_name)) return # UnicodeData.txt file has contains code point ranges like this: @@ -95,14 +89,14 @@ def process_range(start, end, outfile, name): for i in range(int(start, 16), int(end, 16), 64 ): if i > (int(end, 16)-64): outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( - ucs_symbol(i), - ucs_symbol(int(end,16)), + unicode_utils.ucs_symbol(i), + unicode_utils.ucs_symbol(int(end,16)), convert_to_hex(i), name)) break outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( - ucs_symbol(i), - ucs_symbol(i+63), + unicode_utils.ucs_symbol(i), + unicode_utils.ucs_symbol(i+63), convert_to_hex(i), name)) @@ -168,7 +162,7 @@ def process_charmap(flines, outfile): # comments, so we keep these comment lines. outfile.write('%') outfile.write('{:<11s} {:<12s} {:s}\n'.format( - ucs_symbol(int(fields[0], 16)), + unicode_utils.ucs_symbol(int(fields[0], 16)), convert_to_hex(int(fields[0], 16)), fields[1])) @@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines): for line in ulines: fields = line.split(";") if fields[4] == "NSM" or fields[2] == "Cf": - width_dict[int(fields[0], 16)] = ucs_symbol( + width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( int(fields[0], 16)) + '\t0' for line in elines: @@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines): # UnicodeData.txt: fields = line.split(";") if not '..' 
in fields[0]: - width_dict[int(fields[0], 16)] = ucs_symbol( + width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol( int(fields[0], 16)) + '\t2' else: code_points = fields[0].split("..") @@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines): if key in width_dict: del width_dict[key] width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format( - ucs_symbol(int(code_points[0], 16)), - ucs_symbol(int(code_points[1], 16))) + unicode_utils.ucs_symbol(int(code_points[0], 16)), + unicode_utils.ucs_symbol(int(code_points[1], 16))) for key in sorted(width_dict): outfile.write(width_dict[key]+'\n') -- 2.4.3