From e82b9306ac1ed717a0885b031fb2cc1f22cf7681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Wed, 5 Sep 2018 15:54:07 +0200 Subject: [PATCH] Fix matching an ASCII digit followed by a non-ASCII digit using a script run --- ...ipt-run-bug-1-followed-by-Thai-digit.patch | 100 ++++++++++++++++++ perl.spec | 7 ++ 2 files changed, 107 insertions(+) create mode 100644 perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch diff --git a/perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch b/perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch new file mode 100644 index 0000000..480d731 --- /dev/null +++ b/perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch @@ -0,0 +1,100 @@ +From 7b4a3fe1d488df004e3969802fe121697cd3d6e5 Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Thu, 16 Aug 2018 16:14:01 -0600 +Subject: [PATCH] Fix script run bug '1' followed by Thai digit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This does not have a ticket, but was pointed out in +http://nntp.perl.org/group/perl.perl5.porters/251870 + +The logic for deciding if it was needed to check if a character is a +digit was flawed. + +Petr Písař: Ported to 5.28.0. + +Signed-off-by: Petr Písař +--- + regexec.c | 46 +++++++++++++++++++++++++++++++--------------- + t/re/script_run.t | 5 +++++ + 2 files changed, 36 insertions(+), 15 deletions(-) + +diff --git a/regexec.c b/regexec.c +index 95bb254..d1a3937 100644 +--- a/regexec.c ++++ b/regexec.c +@@ -10599,23 +10599,39 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) + scripts_match: + + /* Here, the script of the character is compatible with that of the +- * run. Either they match exactly, or one or both can be any of +- * several scripts, and the intersection is not empty. If the +- * character is not a decimal digit, we are done with it. Otherwise, +- * it could still fail if it is from a different set of 10 than seen +- * already (or we may not have seen any, and we need to set the +- * sequence). If we have determined a single script and that script +- * only has one set of digits (almost all scripts are like that), then +- * this isn't a problem, as any digit must come from the same sequence. +- * The only scripts that have multiple sequences have been constructed +- * to be 0 in 'script_zeros[]'. ++ * run. That means that in most cases, it continues the script run. ++ * Either it and the run match exactly, or one or both can be in any of ++ * several scripts, and the intersection is not empty. But if the ++ * character is a decimal digit, we need further handling. If we ++ * haven't seen a digit before, it would establish what set of 10 all ++ * must come from; and if we have established a set, we need to check ++ * that this is in it. + * +- * Here we check if it is a digit. */ ++ * But there are cases we can rule out without having to look up if ++ * this is a digit: ++ * a. All instances of [0-9] have been dealt with earlier. ++ * b. The next digit encoded by Unicode is 1600 code points further ++ * on, so if the code point in this loop iteration is less than ++ * that, it isn't a digit. ++ * c. Most scripts that have digits have a single set of 10. If ++ * we've encountered a digit in such a script, 'zero_of_run' is ++ * set to the code point (call it z) whose numeric value is 0. ++ * If the code point in this loop iteration is in the range ++ * z..z+9, it is in the script's set of 10, and we've actually ++ * handled it earlier in this function and won't reach this ++ * point. But, code points in that script that aren't in that ++ * range can't be digits, so we don't have to look any such up. ++ * We can tell if this script is such a one by looking at ++ * 'script_zeros[]' for it. It is non-zero iff it has a single ++ * set of digits. This rule doesn't apply if we haven't narrowed ++ * down the possible scripts to a single one yet. Nor if the ++ * zero of the run is '0', as that also hasn't narrowed things ++ * down completely */ + if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT +- && ( ( zero_of_run == 0 +- || ( ( script_of_char >= 0 +- && script_zeros[script_of_char] == 0) +- || intersection)))) ++ && ( intersection ++ || script_of_char < 0 /* Also implies an intersection */ ++ || zero_of_run == '0' ++ || script_zeros[script_of_char] == 0)) + { + SSize_t range_zero_index; + range_zero_index = _invlist_search(decimals_invlist, cp); +diff --git a/t/re/script_run.t b/t/re/script_run.t +index ca234d9..10c7103 100644 +--- a/t/re/script_run.t ++++ b/t/re/script_run.t +@@ -84,6 +84,11 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') { + + # From UTS 39 + like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han"); ++ ++ unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'"; ++ unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit "; ++ unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run, ++ "Two digits in same extended script but from different sets of 10"; + } + + # Until fixed, this was skipping the '[' +-- +2.14.4 + diff --git a/perl.spec b/perl.spec index 4c473c0..3519aa0 100644 --- a/perl.spec +++ b/perl.spec @@ -190,6 +190,10 @@ Patch26: perl-5.29.1-Make-utf8_to_uvchr-slightly-safer.patch # Fix a time race in Time-HiRes/t/itimer.t test, in upstream after 5.29.1 Patch27: perl-5.29.1-Time-HiRes-t-itimer.t-avoid-race-condition.patch +# Fix matching an ASCII digit followed by a non-ASCII digit using a script +# run, in upstream after 5.29.1 +Patch28: perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch + # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048 Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch @@ -2769,6 +2773,7 @@ Perl extension for Version Objects %patch25 -p1 %patch26 -p1 %patch27 -p1 +%patch28 -p1 %patch200 -p1 %patch201 -p1 @@ -2803,6 +2808,7 @@ perl -x patchlevel.h \ 'Fedora Patch25: Fix a buffer overrun in deprecated utf8_to_uvchr()' \ 'Fedora Patch26: Fix a buffer overrun in deprecated utf8_to_uvchr()' \ 'Fedora Patch27: Fix a time race in Time-HiRes/t/itimer.t test' \ + 'Fedora Patch28: Fix matching an ASCII digit followed by a non-ASCII digit using a script run' \ 'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \ 'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \ %{nil} @@ -5095,6 +5101,7 @@ popd - Fix a buffer overrun in deprecated S_is_utf8_common() - Fix a buffer overrun in deprecated utf8_to_uvchr() - Fix a time race in Time-HiRes/t/itimer.t test +- Fix matching an ASCII digit followed by a non-ASCII digit using a script run * Wed Aug 01 2018 Petr Pisar - 4:5.28.0-420 - Fix a file descriptor leak in in-place edits (RT#133314)