Fix matching an ASCII digit followed by a non-ASCII digit using a script run
This commit is contained in:
parent
e039a7964c
commit
e82b9306ac
100
perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
Normal file
100
perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
From 7b4a3fe1d488df004e3969802fe121697cd3d6e5 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Karl Williamson <khw@cpan.org>
|
||||||
|
Date: Thu, 16 Aug 2018 16:14:01 -0600
|
||||||
|
Subject: [PATCH] Fix script run bug '1' followed by Thai digit
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
This does not have a ticket, but was pointed out in
|
||||||
|
http://nntp.perl.org/group/perl.perl5.porters/251870
|
||||||
|
|
||||||
|
The logic for deciding whether it was necessary to check if a character is a
|
||||||
|
digit was flawed.
|
||||||
|
|
||||||
|
Petr Písař: Ported to 5.28.0.
|
||||||
|
|
||||||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||||
|
---
|
||||||
|
regexec.c | 46 +++++++++++++++++++++++++++++++---------------
|
||||||
|
t/re/script_run.t | 5 +++++
|
||||||
|
2 files changed, 36 insertions(+), 15 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/regexec.c b/regexec.c
|
||||||
|
index 95bb254..d1a3937 100644
|
||||||
|
--- a/regexec.c
|
||||||
|
+++ b/regexec.c
|
||||||
|
@@ -10599,23 +10599,39 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
||||||
|
scripts_match:
|
||||||
|
|
||||||
|
/* Here, the script of the character is compatible with that of the
|
||||||
|
- * run. Either they match exactly, or one or both can be any of
|
||||||
|
- * several scripts, and the intersection is not empty. If the
|
||||||
|
- * character is not a decimal digit, we are done with it. Otherwise,
|
||||||
|
- * it could still fail if it is from a different set of 10 than seen
|
||||||
|
- * already (or we may not have seen any, and we need to set the
|
||||||
|
- * sequence). If we have determined a single script and that script
|
||||||
|
- * only has one set of digits (almost all scripts are like that), then
|
||||||
|
- * this isn't a problem, as any digit must come from the same sequence.
|
||||||
|
- * The only scripts that have multiple sequences have been constructed
|
||||||
|
- * to be 0 in 'script_zeros[]'.
|
||||||
|
+ * run. That means that in most cases, it continues the script run.
|
||||||
|
+ * Either it and the run match exactly, or one or both can be in any of
|
||||||
|
+ * several scripts, and the intersection is not empty. But if the
|
||||||
|
+ * character is a decimal digit, we need further handling. If we
|
||||||
|
+ * haven't seen a digit before, it would establish what set of 10 all
|
||||||
|
+ * must come from; and if we have established a set, we need to check
|
||||||
|
+ * that this is in it.
|
||||||
|
*
|
||||||
|
- * Here we check if it is a digit. */
|
||||||
|
+ * But there are cases we can rule out without having to look up if
|
||||||
|
+ * this is a digit:
|
||||||
|
+ * a. All instances of [0-9] have been dealt with earlier.
|
||||||
|
+ * b. The next digit encoded by Unicode is 1600 code points further
|
||||||
|
+ * on, so if the code point in this loop iteration is less than
|
||||||
|
+ * that, it isn't a digit.
|
||||||
|
+ * c. Most scripts that have digits have a single set of 10. If
|
||||||
|
+ * we've encountered a digit in such a script, 'zero_of_run' is
|
||||||
|
+ * set to the code point (call it z) whose numeric value is 0.
|
||||||
|
+ * If the code point in this loop iteration is in the range
|
||||||
|
+ * z..z+9, it is in the script's set of 10, and we've actually
|
||||||
|
+ * handled it earlier in this function and won't reach this
|
||||||
|
+ * point. But, code points in that script that aren't in that
|
||||||
|
+ * range can't be digits, so we don't have to look any such up.
|
||||||
|
+ * We can tell if this script is such a one by looking at
|
||||||
|
+ * 'script_zeros[]' for it. It is non-zero iff it has a single
|
||||||
|
+ * set of digits. This rule doesn't apply if we haven't narrowed
|
||||||
|
+ * down the possible scripts to a single one yet. Nor if the
|
||||||
|
+ * zero of the run is '0', as that also hasn't narrowed things
|
||||||
|
+ * down completely. */
|
||||||
|
if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
|
||||||
|
- && ( ( zero_of_run == 0
|
||||||
|
- || ( ( script_of_char >= 0
|
||||||
|
- && script_zeros[script_of_char] == 0)
|
||||||
|
- || intersection))))
|
||||||
|
+ && ( intersection
|
||||||
|
+ || script_of_char < 0 /* Also implies an intersection */
|
||||||
|
+ || zero_of_run == '0'
|
||||||
|
+ || script_zeros[script_of_char] == 0))
|
||||||
|
{
|
||||||
|
SSize_t range_zero_index;
|
||||||
|
range_zero_index = _invlist_search(decimals_invlist, cp);
|
||||||
|
diff --git a/t/re/script_run.t b/t/re/script_run.t
|
||||||
|
index ca234d9..10c7103 100644
|
||||||
|
--- a/t/re/script_run.t
|
||||||
|
+++ b/t/re/script_run.t
|
||||||
|
@@ -84,6 +84,11 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
|
||||||
|
|
||||||
|
# From UTS 39
|
||||||
|
like("写真だけの結婚式", $script_run, "Mixed Hiragana and Han");
|
||||||
|
+
|
||||||
|
+ unlike "\N{THAI DIGIT FIVE}1", $script_run, "Thai digit followed by '1'";
|
||||||
|
+ unlike "1\N{THAI DIGIT FIVE}", $script_run, "'1' followed by Thai digit ";
|
||||||
|
+ unlike "\N{BENGALI DIGIT ZERO}\N{CHAKMA DIGIT SEVEN}", $script_run,
|
||||||
|
+ "Two digits in same extended script but from different sets of 10";
|
||||||
|
}
|
||||||
|
|
||||||
|
# Until fixed, this was skipping the '['
|
||||||
|
--
|
||||||
|
2.14.4
|
||||||
|
|
@ -190,6 +190,10 @@ Patch26: perl-5.29.1-Make-utf8_to_uvchr-slightly-safer.patch
|
|||||||
# Fix a time race in Time-HiRes/t/itimer.t test, in upstream after 5.29.1
|
# Fix a time race in Time-HiRes/t/itimer.t test, in upstream after 5.29.1
|
||||||
Patch27: perl-5.29.1-Time-HiRes-t-itimer.t-avoid-race-condition.patch
|
Patch27: perl-5.29.1-Time-HiRes-t-itimer.t-avoid-race-condition.patch
|
||||||
|
|
||||||
|
# Fix matching an ASCII digit followed by a non-ASCII digit using a script
|
||||||
|
# run, in upstream after 5.29.1
|
||||||
|
Patch28: perl-5.28.0-Fix-script-run-bug-1-followed-by-Thai-digit.patch
|
||||||
|
|
||||||
# Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
|
# Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048
|
||||||
Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch
|
Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch
|
||||||
|
|
||||||
@ -2769,6 +2773,7 @@ Perl extension for Version Objects
|
|||||||
%patch25 -p1
|
%patch25 -p1
|
||||||
%patch26 -p1
|
%patch26 -p1
|
||||||
%patch27 -p1
|
%patch27 -p1
|
||||||
|
%patch28 -p1
|
||||||
%patch200 -p1
|
%patch200 -p1
|
||||||
%patch201 -p1
|
%patch201 -p1
|
||||||
|
|
||||||
@ -2803,6 +2808,7 @@ perl -x patchlevel.h \
|
|||||||
'Fedora Patch25: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
|
'Fedora Patch25: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
|
||||||
'Fedora Patch26: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
|
'Fedora Patch26: Fix a buffer overrun in deprecated utf8_to_uvchr()' \
|
||||||
'Fedora Patch27: Fix a time race in Time-HiRes/t/itimer.t test' \
|
'Fedora Patch27: Fix a time race in Time-HiRes/t/itimer.t test' \
|
||||||
|
'Fedora Patch28: Fix matching an ASCII digit followed by a non-ASCII digit using a script run' \
|
||||||
'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
|
'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \
|
||||||
'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
|
'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \
|
||||||
%{nil}
|
%{nil}
|
||||||
@ -5095,6 +5101,7 @@ popd
|
|||||||
- Fix a buffer overrun in deprecated S_is_utf8_common()
|
- Fix a buffer overrun in deprecated S_is_utf8_common()
|
||||||
- Fix a buffer overrun in deprecated utf8_to_uvchr()
|
- Fix a buffer overrun in deprecated utf8_to_uvchr()
|
||||||
- Fix a time race in Time-HiRes/t/itimer.t test
|
- Fix a time race in Time-HiRes/t/itimer.t test
|
||||||
|
- Fix matching an ASCII digit followed by a non-ASCII digit using a script run
|
||||||
|
|
||||||
* Wed Aug 01 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.0-420
|
* Wed Aug 01 2018 Petr Pisar <ppisar@redhat.com> - 4:5.28.0-420
|
||||||
- Fix a file descriptor leak in in-place edits (RT#133314)
|
- Fix a file descriptor leak in in-place edits (RT#133314)
|
||||||
|
Loading…
Reference in New Issue
Block a user