185 lines
8.6 KiB
Diff
185 lines
8.6 KiB
Diff
From a824afe95b6272148dce1f8bf4bcd20a667412e6 Mon Sep 17 00:00:00 2001
|
|
From: Karl Williamson <khw@cpan.org>
|
|
Date: Sun, 30 Sep 2018 10:38:02 -0600
|
|
Subject: [PATCH] PATCH: [perl #133547]: script run broken
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
All scripts can have the ASCII digits for their numbers. Scripts with
|
|
their own digits can alternatively use those. Only one of these two
|
|
sets can be used in a script run. The decision as to which set to use
|
|
must be deferred until the first digit is encountered, as otherwise we
|
|
don't know which set will be used. Prior to this commit, the decision
|
|
was being made prematurely in some cases. As a result of this change,
|
|
the non-ASCII-digits in the Common script need to be special-cased, and
|
|
different criteria are used to decide if we need to look up whether a
|
|
character is a digit or not.
|
|
|
|
Petr Písař: Ported to 5.28.1 from
|
|
393e5a4585b92e635cfc4eee34da8f86f3bfd2af.
|
|
|
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
---
|
|
regexec.c | 111 +++++++++++++++++++++++-----------------------
|
|
t/re/script_run.t | 5 +++
|
|
2 files changed, 61 insertions(+), 55 deletions(-)
|
|
|
|
diff --git a/regexec.c b/regexec.c
|
|
index 899d979..201d9aa 100644
|
|
--- a/regexec.c
|
|
+++ b/regexec.c
|
|
@@ -10323,6 +10323,10 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
|
|
|
/* Look at each character in the sequence */
|
|
while (s < send) {
|
|
+ /* If the current character being examined is a digit, this is the code
|
|
+ * point of the zero for its sequence of 10 */
|
|
+ UV zero_of_char;
|
|
+
|
|
UV cp;
|
|
|
|
/* The code allows all scripts to use the ASCII digits. This is
|
|
@@ -10434,16 +10438,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
|
script_of_run = script_of_char;
|
|
}
|
|
|
|
- /* All decimal digits must be from the same sequence of 10. Above, we
|
|
- * handled any ASCII digits without descending to here. We also
|
|
- * handled the case where we already knew what digit sequence is the
|
|
- * one to use, and the character is in that sequence. Now that we know
|
|
- * the script, we can use script_zeros[] to directly find which
|
|
- * sequence the script uses, except in a few cases it returns 0 */
|
|
- if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) {
|
|
- zero_of_run = script_zeros[script_of_char];
|
|
- }
|
|
-
|
|
/* Now we can see if the script of the character is the same as that of
|
|
* the run */
|
|
if (LIKELY(script_of_char == script_of_run)) {
|
|
@@ -10601,55 +10595,62 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
|
|
/* Here, the script of the character is compatible with that of the
|
|
* run. That means that in most cases, it continues the script run.
|
|
* Either it and the run match exactly, or one or both can be in any of
|
|
- * several scripts, and the intersection is not empty. But if the
|
|
- * character is a decimal digit, we need further handling. If we
|
|
- * haven't seen a digit before, it would establish what set of 10 all
|
|
- * must come from; and if we have established a set, we need to check
|
|
- * that this is in it.
|
|
- *
|
|
- * But there are cases we can rule out without having to look up if
|
|
- * this is a digit:
|
|
- * a. All instances of [0-9] have been dealt with earlier.
|
|
- * b. The next digit encoded by Unicode is 1600 code points further
|
|
- * on, so if the code point in this loop iteration is less than
|
|
- * that, it isn't a digit.
|
|
- * c. Most scripts that have digits have a single set of 10. If
|
|
- * we've encountered a digit in such a script, 'zero_of_run' is
|
|
- * set to the code point (call it z) whose numeric value is 0.
|
|
- * If the code point in this loop iteration is in the range
|
|
- * z..z+9, it is in the script's set of 10, and we've actually
|
|
- * handled it earlier in this function and won't reach this
|
|
- * point. But, code points in that script that aren't in that
|
|
- * range can't be digits, so we don't have to look any such up.
|
|
- * We can tell if this script is such a one by looking at
|
|
- * 'script_zeros[]' for it. It is non-zero iff it has a single
|
|
- * set of digits. This rule doesn't apply if we haven't narrowed
|
|
- * down the possible scripts to a single one yet. Nor if the
|
|
- * zero of the run is '0', as that also hasn't narrowed things
|
|
- * down completely */
|
|
- if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT
|
|
- && ( intersection
|
|
- || script_of_char < 0 /* Also implies an intersection */
|
|
- || zero_of_run == '0'
|
|
- || script_zeros[script_of_char] == 0))
|
|
+ * several scripts, and the intersection is not empty. However, if the
|
|
+ * character is a decimal digit, it could still mean failure if it is
|
|
+ * from the wrong sequence of 10. So, we need to look at if it's a
|
|
+ * digit. We've already handled the 10 decimal digits, and the next
|
|
+ * lowest one is this one: */
|
|
+ if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
|
|
+ continue; /* Not a digit; this character is part of the run */
|
|
+ }
|
|
+
|
|
+ /* If we have a definitive '0' for the script of this character, we
|
|
+ * know that for this to be a digit, it must be in the range of +0..+9
|
|
+ * of that zero. */
|
|
+ if ( script_of_char >= 0
|
|
+ && (zero_of_char = script_zeros[script_of_char]))
|
|
{
|
|
- SSize_t zero_of_char_index;
|
|
- zero_of_char_index = _invlist_search(decimals_invlist, cp);
|
|
- if ( LIKELY(zero_of_char_index >= 0)
|
|
- && ELEMENT_RANGE_MATCHES_INVLIST(zero_of_char_index))
|
|
+ if ( cp < zero_of_char
|
|
+ || cp > zero_of_char + 9)
|
|
{
|
|
- UV zero_of_char = decimals_array[zero_of_char_index];
|
|
- if (zero_of_run) {
|
|
- if (zero_of_run != zero_of_char) {
|
|
- retval = FALSE;
|
|
- break;
|
|
- }
|
|
- }
|
|
- else {
|
|
- zero_of_run = zero_of_char;
|
|
- }
|
|
+ continue; /* Not a digit; this character is part of the run
|
|
+ */
|
|
+ }
|
|
+
|
|
+ }
|
|
+ else { /* Need to look up if this character is a digit or not */
|
|
+ SSize_t index_of_zero_of_char;
|
|
+ index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
|
|
+ if ( UNLIKELY(index_of_zero_of_char < 0)
|
|
+ || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
|
|
+ {
|
|
+ continue; /* Not a digit; this character is part of the run.
|
|
+ */
|
|
+ }
|
|
+
|
|
+ zero_of_char = decimals_array[index_of_zero_of_char];
|
|
+ }
|
|
+
|
|
+ /* Here, the character is a decimal digit, and the zero of its sequence
|
|
+ * of 10 is in 'zero_of_char'. If we already have a zero for this run,
|
|
+ * they better be the same. */
|
|
+ if (zero_of_run) {
|
|
+ if (zero_of_run != zero_of_char) {
|
|
+ retval = FALSE;
|
|
+ break;
|
|
}
|
|
}
|
|
+ else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
|
|
+
|
|
+ /* Here, the script run isn't Common, but the current digit is in
|
|
+ * Common, and isn't '0'-'9' (those were handled earlier). Only
|
|
+ * '0'-'9' are acceptable in non-Common scripts. */
|
|
+ retval = FALSE;
|
|
+ break;
|
|
+ }
|
|
+ else { /* Otherwise we now have a zero for this run */
|
|
+ zero_of_run = zero_of_char;
|
|
+ }
|
|
} /* end of looping through CLOSESR text */
|
|
|
|
Safefree(intersection);
|
|
diff --git a/t/re/script_run.t b/t/re/script_run.t
|
|
index 10c7103..f8809e3 100644
|
|
--- a/t/re/script_run.t
|
|
+++ b/t/re/script_run.t
|
|
@@ -97,4 +97,9 @@ foreach my $type ('script_run', 'sr', 'atomic_script_run', 'asr') {
|
|
like("abc", qr/(*asr:a[bc]*c)/, "Outer asr works on a run");
|
|
unlike("abc", qr/(*asr:a(*asr:[bc]*)c)/, "Nested asr works to exclude some things");
|
|
|
|
+ like("\x{0980}12\x{0993}", qr/^(*sr:.{4})/,
|
|
+ "Script with own zero works with ASCII digits"); # perl #133547
|
|
+ like("\x{3041}12\x{3041}", qr/^(*sr:.{4})/,
|
|
+ "Script without own zero works with ASCII digits");
|
|
+
|
|
done_testing();
|
|
--
|
|
2.17.2
|
|
|