175 lines
8.3 KiB
Diff
175 lines
8.3 KiB
Diff
From 739762602079d68c47cbde3b29275e844d0c92ef Mon Sep 17 00:00:00 2001
|
|
From: Karl Williamson <khw@cpan.org>
|
|
Date: Thu, 10 Jan 2019 20:59:31 -0700
|
|
Subject: [PATCH] PATCH: [perl #133756] Failure to match properly
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
This was caused by a counting error.
|
|
|
|
An EXACTFish regnode has a finite length it can hold for the string
|
|
being matched. If that length is exceeded, a 2nd node is used for the
|
|
next segment of the string, for as many regnodes as are needed.
|
|
|
|
A problem occurs if a regnode ends with one of the 22 characters in
|
|
Unicode 11 that occur in non-final positions of a multi-character fold.
|
|
The design of the pattern matching engine doesn't allow matches across
|
|
regnodes. Consider, for example if a node ended in the letter 'f' and
|
|
the next node begins with the letter 'i'. That sequence should match,
|
|
under /i, the ligature "fi" (U+FB01). But it wouldn't because the
|
|
pattern splits them across nodes. The solution I adopted was to forbid
|
|
a node to end with one of those 22 characters if there is another string
|
|
node that follows it. This is not fool proof, for example, if the
|
|
entire node consisted of only these characters, one would have to split
|
|
it at some position (In that case, we just take as much of the string as
|
|
will fit.) But for real life applications, it is good enough.
|
|
|
|
What happens if a node ends with one of the 22, is that the node is
|
|
shortened so that those are instead placed at the beginning of the
|
|
following node. When the code encounters this situation, it backs off
|
|
until it finds a character that isn't a non-final fold one, and closes
|
|
the node with that one.
|
|
|
|
A /i node is filled with the fold of the input, for several reasons.
|
|
The most obvious is that it saves time, you can skip folding the pattern
|
|
at runtime. But there are reasons based on the design of the optimzer
|
|
as well, which I won't go into here, but are documented in regcomp.c.
|
|
When we back out the final characters in a node, we also have to back
|
|
out the corresponding unfolded characters in the input, so that those
|
|
can be (folded) into the following node. Since the number of characters
|
|
in the fold may not be the same as unfolded, there is not an easily
|
|
discernable correspondence between the input and the folded output.
|
|
That means that generally, what has to be done is that the input is
|
|
reparsed from the beginning of the node, but the permitted length has
|
|
been shortened (we know precisely how much to shorten it to) so that it
|
|
will end with something other than the 22. But, the code saves the
|
|
previous input character's position (for other reasons), so if we only
|
|
have to backup one character, we can just use that and not have to
|
|
reparse.
|
|
|
|
This bug was that the code thought a two character backup was really a
|
|
one character one, and did not reparse the node, creating an off-by-one
|
|
error, and a character was simply omitted in the pattern (that should
|
|
have started the following node). And the input had two of the 22
|
|
characters adjacent to each other in just the right positions that the
|
|
node was split. The bisect showed that when the node size was changed
|
|
the bug went away, at least for this particular input string. But a
|
|
different, longer, string would have triggered the bug, and this commit
|
|
fixes that.
|
|
|
|
This bug is actually very unlikely to occur in most real world
|
|
applications. That is because other changes in the regex compiler have
|
|
caused nodes to be split so that things that don't particpate in folds
|
|
at all are separated out into EXACT nodes. (The reason for that is it
|
|
allows the optimizer things to grab on to under /i that it wouldn't
|
|
otherwise have known about.) That means that anything like this string
|
|
would never cause the bug to happen because blanks and commas, etc.
|
|
would be in separate nodes, and so no node would ever get large enough
|
|
to fill the 238 available byte slots in a node (235 on EBCDIC). Only a
|
|
long string without punctuation would trigger it. I have artificially
|
|
constructed such a string in the tests added by this commit.
|
|
|
|
One of the 22 characters is 't', so long strings of DNA "ACTG" could
|
|
trigger this bug. I find it somewhat amusing that this is something
|
|
like a DNA transcription error, which occurs in nature at very low
|
|
rates, but selection, it is believed, will make sure the error rate is
|
|
above zero.
|
|
|
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
---
|
|
regcomp.c | 15 ++++++++++-----
|
|
t/re/pat_advanced.t | 27 +++++++++++++++++++++++++++
|
|
2 files changed, 37 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/regcomp.c b/regcomp.c
|
|
index 56b83f964a..58cb941b06 100644
|
|
--- a/regcomp.c
|
|
+++ b/regcomp.c
|
|
@@ -14377,6 +14377,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
|
|
* identifies, so when it is set to less than the full node, we can
|
|
* skip the rest of this */
|
|
if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
|
|
+ PERL_UINT_FAST8_T backup_count = 0;
|
|
|
|
const STRLEN full_len = len;
|
|
|
|
@@ -14393,7 +14394,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
|
|
goto loopdone;
|
|
}
|
|
|
|
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
|
|
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
|
|
+ backup_count++;
|
|
+ }
|
|
len = s - s0 + 1;
|
|
}
|
|
else {
|
|
@@ -14435,6 +14438,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
|
|
* special case the very first byte in the string, so
|
|
* we don't read outside the string */
|
|
s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
|
|
+ backup_count++;
|
|
} /* End of loop backwards through the string */
|
|
|
|
/* If there were only problematic characters in the string,
|
|
@@ -14458,12 +14462,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
|
|
} else {
|
|
|
|
/* Here, the node does contain some characters that aren't
|
|
- * problematic. If one such is the final character in the
|
|
- * node, we are done */
|
|
- if (len == full_len) {
|
|
+ * problematic. If we didn't have to backup any, then the
|
|
+ * final character in the node is non-problematic, and we
|
|
+ * can take the node as-is */
|
|
+ if (backup_count == 0) {
|
|
goto loopdone;
|
|
}
|
|
- else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
|
|
+ else if (backup_count == 1) {
|
|
|
|
/* If the final character is problematic, but the
|
|
* penultimate is not, back-off that last character to
|
|
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
|
|
index d90ceeb5bd..ade8b1587a 100644
|
|
--- a/t/re/pat_advanced.t
|
|
+++ b/t/re/pat_advanced.t
|
|
@@ -2176,6 +2176,33 @@ EOP
|
|
}
|
|
}
|
|
ok(! $failed, "Matched multi-char fold 'ss' across EXACTF node boundaries; if failed, was at count $failed");
|
|
+
|
|
+ for my $non_finals ("t", "ft", "ift", "sift") {
|
|
+ my $base_pat = $non_finals . "enKalt"; # (The tail is taken from
|
|
+ # the trouble ticket, is
|
|
+ # arbitrary)
|
|
+ for my $utf8 ("non-UTF-8", "UTF-8") {
|
|
+
|
|
+ # Try at different lengths to be sure to get a node boundary
|
|
+ for my $repeat (120 .. 270) { # [perl #133756]
|
|
+ my $head = ("b" x $repeat) . "\xDC";
|
|
+ my $pat = $base_pat;
|
|
+ utf8::upgrade($pat) if $utf8 eq "UTF-8";
|
|
+ $pat = $head . $pat;
|
|
+ my $text = $head . $base_pat;
|
|
+
|
|
+ if ($text !~ /$pat/i) {
|
|
+ $failed = $repeat;
|
|
+ last;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ok(! $failed, "A non-final fold character "
|
|
+ . (length($non_finals) - 1)
|
|
+ . " characters from the end of an EXACTFish"
|
|
+ . " $utf8 pattern works; if failed, was at count $failed");
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
{
|
|
--
|
|
2.17.2
|
|
|