Some more additions to the translit_neutral file by Marko Myllynen

(and pylint warning fixes to the gen_translit_* scripts by Pravin Satpute)
This commit is contained in:
Mike FABIAN 2015-07-23 15:44:24 +02:00
parent 37b7dfda47
commit 706a051a42
6 changed files with 95 additions and 70 deletions

View File

@ -1,4 +1,4 @@
From e857f3e60cfb440cdaa7f933e9989d7b0a06e0e2 Mon Sep 17 00:00:00 2001
From 46f5006361350be7eb076c8c968e1cdfc4eaad26 Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Wed, 20 May 2015 11:16:30 +0200
Subject: [PATCH 4/5] Add transliteration rules for da, nb, nn, and sv locales.
@ -16,10 +16,10 @@ for localedata/Changelog
3 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/localedata/locales/da_DK b/localedata/locales/da_DK
index c5024a4..d1d4087 100644
index a3c5561..3248e3c 100644
--- a/localedata/locales/da_DK
+++ b/localedata/locales/da_DK
@@ -137,11 +137,26 @@ translit_start
@@ -136,11 +136,26 @@ translit_start
include "translit_combining";""

View File

@ -1,4 +1,4 @@
From 0df7fd67146848ce3d61ddeffdd59f58bb408312 Mon Sep 17 00:00:00 2001
From 18f1d53b474ce60dbb943e08478e712e72ece02e Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Thu, 30 Apr 2015 16:44:03 +0200
Subject: [PATCH 2/5] Addition and fixes for translit_neutral
@ -15,14 +15,14 @@ Content-Transfer-Encoding: 8bit
- Add some more currency signs suggested by Marko Myllynen
- Add another patch with more characters by Marko Myllynen
---
localedata/locales/translit_neutral | 462 +++++++++++++++++++++++++++++++++++-
1 file changed, 457 insertions(+), 5 deletions(-)
localedata/locales/translit_neutral | 470 +++++++++++++++++++++++++++++++++++-
1 file changed, 465 insertions(+), 5 deletions(-)
diff --git a/localedata/locales/translit_neutral b/localedata/locales/translit_neutral
index b5cc079..a1a7472 100644
index b5cc079..c16d6f0 100644
--- a/localedata/locales/translit_neutral
+++ b/localedata/locales/translit_neutral
@@ -18,6 +18,16 @@ include "translit_wide";""
@@ -18,30 +18,306 @@ include "translit_wide";""
% NO-BREAK SPACE
<U00A0> <U0020>
@ -39,7 +39,10 @@ index b5cc079..a1a7472 100644
% COPYRIGHT SIGN
<U00A9> "<U0028><U0043><U0029>"
% LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
@@ -26,22 +36,284 @@ include "translit_wide";""
<U00AB> "<U003C><U003C>"
+% NOT SIGN
+<U00AC> <U0021>
% SOFT HYPHEN
<U00AD> <U002D>
% REGISTERED SIGN
<U00AE> "<U0028><U0052><U0029>"
@ -72,6 +75,8 @@ index b5cc079..a1a7472 100644
<U00E6> "<U0061><U0065>"
+% LATIN SMALL LETTER ETH
+<U00F0> <U0064>
+% DIVISION SIGN
+<U00F7> <U002F>
% LATIN SMALL LETTER O WITH STROKE
-<U00F8> "<U006F><U0065>"
+<U00F8> <U006F>
@ -326,7 +331,7 @@ index b5cc079..a1a7472 100644
% MODIFIER LETTER PRIME
<U02B9> <U2032>;<U00B4>
% MODIFIER LETTER DOUBLE PRIME
@@ -68,6 +340,136 @@ include "translit_wide";""
@@ -68,6 +344,138 @@ include "translit_wide";""
<U02D0> <U003A>
% SMALL TILDE
<U02DC> <U007E>
@ -442,6 +447,8 @@ index b5cc079..a1a7472 100644
+<U1D99> <U0075>
+% LATIN SMALL LETTER A WITH RIGHT HALF RING
+<U1E9A> <U0061>
+% LATIN SMALL LETTER LONG S WITH DOT ABOVE
+<U1E9B> <U0073>
+% LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE
+<U1E9C> <U0073>
+% LATIN SMALL LETTER LONG S WITH HIGH STROKE
@ -463,7 +470,16 @@ index b5cc079..a1a7472 100644
% ZERO WIDTH SPACE
<U200B> ""
% HYPHEN
@@ -130,12 +532,36 @@ include "translit_wide";""
@@ -120,6 +528,8 @@ include "translit_wide";""
<U2039> <U003C>
% SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
<U203A> <U003E>
+% TIRONIAN SIGN ET
+<U204A> <U0026>
% FRACTION SLASH
<U2044> <U002F>
% WORD JOINER
@@ -130,12 +540,36 @@ include "translit_wide";""
<U2062> ""
% INVISIBLE SEPARATOR
<U2063> ""
@ -501,7 +517,7 @@ index b5cc079..a1a7472 100644
% TRADE MARK SIGN
<U2122> "<U0028><U0054><U004D><U0029>"
% OHM SIGN
@@ -150,10 +576,10 @@ include "translit_wide";""
@@ -150,10 +584,10 @@ include "translit_wide";""
<U2194> "<U003C><U002D><U003E>"
% LEFTWARDS DOUBLE ARROW
<U21D0> "<U003C><U003D>"
@ -514,7 +530,7 @@ index b5cc079..a1a7472 100644
% MINUS SIGN
<U2212> <U2013>;<U002D>
% DIVISION SLASH
@@ -166,6 +592,8 @@ include "translit_wide";""
@@ -166,6 +600,8 @@ include "translit_wide";""
<U2219> <U2022>;<U00B7>
% DIVIDES
<U2223> <U007C>
@ -523,7 +539,7 @@ index b5cc079..a1a7472 100644
% RATIO
<U2236> <U003A>
% TILDE OPERATOR
@@ -280,8 +708,32 @@ include "translit_wide";""
@@ -280,8 +716,32 @@ include "translit_wide";""
<U2534> <U002B>
% BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
<U253C> <U002B>

View File

@ -1,4 +1,4 @@
From 8364315a1d6be5a5b47c7b5e5e678127001bca62 Mon Sep 17 00:00:00 2001
From 68370600530b6514877793193c1b0ab9a6f46acc Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Mon, 18 May 2015 07:03:13 +0200
Subject: [PATCH 1/5] Remove duplicate transliterations for U+0152 and U+0153

View File

@ -1,4 +1,4 @@
From 207b5d41e812f523e9dfd971a1d40e02eae84fed Mon Sep 17 00:00:00 2001
From ceebbb1989d33ca7a5a6827dbdbcc6579f275ecc Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Mon, 22 Jun 2015 13:00:44 +0200
Subject: [PATCH 5/5] Unicode 8.0.0 update.
@ -9912,10 +9912,10 @@ index 31c8a7e..aa0e914 100644
2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;;
2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;;
diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py
index 07ed739..11250c6 100755
index 2551ce1..abcfeef 100755
--- a/localedata/unicode-gen/gen_translit_combining.py
+++ b/localedata/unicode-gen/gen_translit_combining.py
@@ -168,7 +168,9 @@ def is_combining_remove(code_point):
@@ -169,7 +169,9 @@ def is_combining_remove(code_point):
'PAHAWH HMONG',
'MIAO',
'DUPLOYAN',

View File

@ -1,4 +1,4 @@
From e1663b8dc7ca04db26a37d745bfe043df83d3a64 Mon Sep 17 00:00:00 2001
From ee7a6e89183bdc8453dd9a0dadf133e65deb9e0b Mon Sep 17 00:00:00 2001
From: Mike FABIAN <mfabian@redhat.com>
Date: Fri, 17 Apr 2015 09:12:05 +0200
Subject: [PATCH 3/5] Update the translit files to Unicode 7.0.0
@ -28,17 +28,17 @@ for localedata/ChangeLog
localedata/locales/translit_font | 151 ++++-
localedata/locales/translit_fraction | 15 +-
localedata/unicode-gen/Makefile | 42 +-
localedata/unicode-gen/gen_translit_circle.py | 149 +++++
localedata/unicode-gen/gen_translit_cjk_compat.py | 219 ++++++++
localedata/unicode-gen/gen_translit_combining.py | 441 +++++++++++++++
localedata/unicode-gen/gen_translit_compat.py | 325 +++++++++++
localedata/unicode-gen/gen_translit_font.py | 155 ++++++
localedata/unicode-gen/gen_translit_fraction.py | 196 +++++++
localedata/unicode-gen/gen_translit_circle.py | 150 +++++
localedata/unicode-gen/gen_translit_cjk_compat.py | 220 ++++++++
localedata/unicode-gen/gen_translit_combining.py | 442 +++++++++++++++
localedata/unicode-gen/gen_translit_compat.py | 326 +++++++++++
localedata/unicode-gen/gen_translit_font.py | 156 ++++++
localedata/unicode-gen/gen_translit_fraction.py | 197 +++++++
localedata/unicode-gen/gen_unicode_ctype.py | 497 +----------------
localedata/unicode-gen/unicode_utils.py | 502 +++++++++++++++++
localedata/unicode-gen/utf8_compatibility.py | 217 ++------
localedata/unicode-gen/utf8_gen.py | 28 +-
17 files changed, 3890 insertions(+), 713 deletions(-)
17 files changed, 3896 insertions(+), 713 deletions(-)
create mode 100755 localedata/unicode-gen/gen_translit_circle.py
create mode 100755 localedata/unicode-gen/gen_translit_cjk_compat.py
create mode 100755 localedata/unicode-gen/gen_translit_combining.py
@ -2395,11 +2395,12 @@ index 166ee31..920bf0e 100644
downloads: $(DOWNLOADS)
diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py
new file mode 100755
index 0000000..a146e7f
index 0000000..6142859
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_circle.py
@@ -0,0 +1,149 @@
@@ -0,0 +1,150 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_circle file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -2465,9 +2466,9 @@ index 0000000..a146e7f
+ translit_file.write('\n')
+ translit_file.write('% Transliterations of encircled characters.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_circle.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_circle.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
@ -2550,11 +2551,12 @@ index 0000000..a146e7f
+ output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py
new file mode 100755
index 0000000..a87d546
index 0000000..627ff6b
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_cjk_compat.py
@@ -0,0 +1,219 @@
@@ -0,0 +1,220 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_cjk_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -2622,9 +2624,9 @@ index 0000000..a87d546
+ translit_file.write('% Transliterations of CJK compatibility ')
+ translit_file.write('characters.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_cjk_compat.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_cjk_compat.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
@ -2652,7 +2654,7 @@ index 0000000..a87d546
+ (0x00B2,): [0x005E, 0x0032], # ² → ^2
+ (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
+ (0x2113,): [0x006C], # ℓ → l
+ (0x00B3,): [0x005E, 0x0033], # ³ → ^3
+ (0x00B3,): [0x005E, 0x0033], # ³ → ^3
+ (0x00B5,): [0x0075], # µ → u
+ (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
+ (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
@ -2680,7 +2682,7 @@ index 0000000..a87d546
+ special_decomposed_code_points = special_decompose(
+ decomposed_code_points[-1])
+ if (special_decomposed_code_points
+ != decomposed_code_points[-1]):
+ != decomposed_code_points[-1]):
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ continue
@ -2689,7 +2691,7 @@ index 0000000..a87d546
+ special_decomposed_code_points += special_decompose(
+ [decomposed_code_point])
+ if (special_decomposed_code_points
+ == decomposed_code_points[-1]):
+ == decomposed_code_points[-1]):
+ break
+ decomposed_code_points.append(
+ special_decomposed_code_points)
@ -2775,11 +2777,12 @@ index 0000000..a87d546
+ output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py
new file mode 100755
index 0000000..07ed739
index 0000000..2551ce1
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_combining.py
@@ -0,0 +1,441 @@
@@ -0,0 +1,442 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_combining file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -2847,9 +2850,9 @@ index 0000000..07ed739
+ translit_file.write('combining characters (accents,\n')
+ translit_file.write('% pronounciation marks, etc.).\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_combining.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_combining.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
@ -3061,7 +3064,7 @@ index 0000000..07ed739
+ (0x2226,): [0x0021, 0x2225], # ∦ → !∥
+ (0x2241,): [0x0021, 0x007E], # ≁ → !~
+ (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
+ (0x2247,): [0x0021, 0x007E, 0x003D ], # ≇ → !~=
+ (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
+ (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
+ (0x2260,): [0x0021, 0x003D], # ≠ → !=
+ (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
@ -3135,7 +3138,7 @@ index 0000000..07ed739
+ special_decomposed_code_points = special_decompose(
+ decomposed_code_points[-1])
+ if (special_decomposed_code_points
+ != decomposed_code_points[-1]):
+ != decomposed_code_points[-1]):
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ continue
@ -3144,7 +3147,7 @@ index 0000000..07ed739
+ special_decomposed_code_points += special_decompose(
+ [decomposed_code_point])
+ if (special_decomposed_code_points
+ == decomposed_code_points[-1]):
+ == decomposed_code_points[-1]):
+ break
+ decomposed_code_points.append(
+ special_decomposed_code_points)
@ -3222,11 +3225,12 @@ index 0000000..07ed739
+ output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py
new file mode 100755
index 0000000..d99e56d
index 0000000..0e824a8
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_compat.py
@@ -0,0 +1,325 @@
@@ -0,0 +1,326 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_compat file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -3293,9 +3297,9 @@ index 0000000..d99e56d
+ translit_file.write('% Transliterations of compatibility characters ')
+ translit_file.write('and ligatures.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_compat.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_compat.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
@ -3347,9 +3351,9 @@ index 0000000..d99e56d
+ decomposed_code_points = [int(x, 16)
+ for x in decomposition.split(' ')]
+ if (len(decomposed_code_points) > 1
+ and decomposed_code_points[0] == 0x0020
+ and decomposed_code_points[1] >= 0x0300
+ and decomposed_code_points[1] <= 0x03FF):
+ and decomposed_code_points[0] == 0x0020
+ and decomposed_code_points[1] >= 0x0300
+ and decomposed_code_points[1] <= 0x03FF):
+ # Decomposes into a space followed by a combining character.
+ # This is not useful for transliteration.
+ return []
@ -3463,7 +3467,7 @@ index 0000000..d99e56d
+ special_decomposed_code_points = special_decompose(
+ decomposed_code_points[-1])
+ if (special_decomposed_code_points
+ != decomposed_code_points[-1]):
+ != decomposed_code_points[-1]):
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ continue
@ -3472,7 +3476,7 @@ index 0000000..d99e56d
+ special_decomposed_code_points += special_decompose(
+ [decomposed_code_point])
+ if (special_decomposed_code_points
+ == decomposed_code_points[-1]):
+ == decomposed_code_points[-1]):
+ break
+ decomposed_code_points.append(
+ special_decomposed_code_points)
@ -3553,11 +3557,12 @@ index 0000000..d99e56d
+ output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py
new file mode 100755
index 0000000..c7ec509
index 0000000..0723622
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_font.py
@@ -0,0 +1,155 @@
@@ -0,0 +1,156 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_font file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -3623,9 +3628,9 @@ index 0000000..c7ec509
+ translit_file.write('\n')
+ translit_file.write('% Transliterations of font equivalents.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_font.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_font.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('\n')
+ translit_file.write('LC_CTYPE\n')
+ translit_file.write('\n')
@ -3714,11 +3719,12 @@ index 0000000..c7ec509
+ output_tail(TRANSLIT_FILE, tail=TAIL)
diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py
new file mode 100755
index 0000000..bf460f2
index 0000000..5bf63ea
--- /dev/null
+++ b/localedata/unicode-gen/gen_translit_fraction.py
@@ -0,0 +1,196 @@
@@ -0,0 +1,197 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Generate a translit_fraction file from a UnicodeData file.
+# Copyright (C) 2015 Free Software Foundation, Inc.
@ -3784,9 +3790,9 @@ index 0000000..bf460f2
+ translit_file.write('\n')
+ translit_file.write('% Transliterations of fractions.\n')
+ translit_file.write('% Generated automatically from UnicodeData.txt '
+ + 'by gen_translit_fraction.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ + 'by gen_translit_fraction.py '
+ + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ + 'for Unicode {:s}.\n'.format(unicode_version))
+ translit_file.write('% The replacements have been surrounded ')
+ translit_file.write('with spaces, because fractions are\n')
+ translit_file.write('% often preceded by a decimal number and ')
@ -3840,7 +3846,7 @@ index 0000000..bf460f2
+ special_decomposed_code_points = special_decompose(
+ decomposed_code_points[-1])
+ if (special_decomposed_code_points
+ != decomposed_code_points[-1]):
+ != decomposed_code_points[-1]):
+ decomposed_code_points.append(
+ special_decomposed_code_points)
+ continue
@ -3849,7 +3855,7 @@ index 0000000..bf460f2
+ special_decomposed_code_points += special_decompose(
+ [decomposed_code_point])
+ if (special_decomposed_code_points
+ == decomposed_code_points[-1]):
+ == decomposed_code_points[-1]):
+ break
+ decomposed_code_points.append(
+ special_decomposed_code_points)

View File

@ -1,6 +1,6 @@
%define glibcsrcdir glibc-2.21-357-gb40a4e1
%define glibcversion 2.21.90
%define glibcrelease 18%{?dist}
%define glibcrelease 19%{?dist}
# Pre-release tarballs are pulled in from git using a command that is
# effectively:
#
@ -1840,6 +1840,9 @@ rm -f *.filelist*
%endif
%changelog
* Thu Jul 23 2015 Mike FABIAN <mfabian@redhat.com> - 2.21.90-19
- some more additions to the translit_neutral file by Marko Myllynen
* Tue Jul 14 2015 Mike FABIAN <mfabian@redhat.com> - 2.21.90-18
- Unicode 8.0.0 updates, including the transliteration files (#1238412).