diff --git a/perl-5.29.6-PATCH-perl-133756-Failure-to-match-properly.patch b/perl-5.29.6-PATCH-perl-133756-Failure-to-match-properly.patch new file mode 100644 index 0000000..ddb1870 --- /dev/null +++ b/perl-5.29.6-PATCH-perl-133756-Failure-to-match-properly.patch @@ -0,0 +1,174 @@ +From 739762602079d68c47cbde3b29275e844d0c92ef Mon Sep 17 00:00:00 2001 +From: Karl Williamson +Date: Thu, 10 Jan 2019 20:59:31 -0700 +Subject: [PATCH] PATCH: [perl #133756] Failure to match properly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This was caused by a counting error. + +An EXACTFish regnode has a finite length it can hold for the string +being matched. If that length is exceeded, a 2nd node is used for the +next segment of the string, for as many regnodes as are needed. + +A problem occurs if a regnode ends with one of the 22 characters in +Unicode 11 that occur in non-final positions of a multi-character fold. +The design of the pattern matching engine doesn't allow matches across +regnodes. Consider, for example if a node ended in the letter 'f' and +the next node begins with the letter 'i'. That sequence should match, +under /i, the ligature "fi" (U+FB01). But it wouldn't because the +pattern splits them across nodes. The solution I adopted was to forbid +a node to end with one of those 22 characters if there is another string +node that follows it. This is not fool proof, for example, if the +entire node consisted of only these characters, one would have to split +it at some position (In that case, we just take as much of the string as +will fit.) But for real life applications, it is good enough. + +What happens if a node ends with one of the 22, is that the node is +shortened so that those are instead placed at the beginning of the +following node. When the code encounters this situation, it backs off +until it finds a character that isn't a non-final fold one, and closes +the node with that one. + +A /i node is filled with the fold of the input, for several reasons. +The most obvious is that it saves time, you can skip folding the pattern +at runtime. But there are reasons based on the design of the optimzer +as well, which I won't go into here, but are documented in regcomp.c. +When we back out the final characters in a node, we also have to back +out the corresponding unfolded characters in the input, so that those +can be (folded) into the following node. Since the number of characters +in the fold may not be the same as unfolded, there is not an easily +discernable correspondence between the input and the folded output. +That means that generally, what has to be done is that the input is +reparsed from the beginning of the node, but the permitted length has +been shortened (we know precisely how much to shorten it to) so that it +will end with something other than the 22. But, the code saves the +previous input character's position (for other reasons), so if we only +have to backup one character, we can just use that and not have to +reparse. + +This bug was that the code thought a two character backup was really a +one character one, and did not reparse the node, creating an off-by-one +error, and a character was simply omitted in the pattern (that should +have started the following node). And the input had two of the 22 +characters adjacent to each other in just the right positions that the +node was split. The bisect showed that when the node size was changed +the bug went away, at least for this particular input string. But a +different, longer, string would have triggered the bug, and this commit +fixes that. + +This bug is actually very unlikely to occur in most real world +applications. That is because other changes in the regex compiler have +caused nodes to be split so that things that don't particpate in folds +at all are separated out into EXACT nodes. (The reason for that is it +allows the optimizer things to grab on to under /i that it wouldn't +otherwise have known about.) That means that anything like this string +would never cause the bug to happen because blanks and commas, etc. +would be in separate nodes, and so no node would ever get large enough +to fill the 238 available byte slots in a node (235 on EBCDIC). Only a +long string without punctuation would trigger it. I have artificially +constructed such a string in the tests added by this commit. + +One of the 22 characters is 't', so long strings of DNA "ACTG" could +trigger this bug. I find it somewhat amusing that this is something +like a DNA transcription error, which occurs in nature at very low +rates, but selection, it is believed, will make sure the error rate is +above zero. + +Signed-off-by: Petr Písař +--- + regcomp.c | 15 ++++++++++----- + t/re/pat_advanced.t | 27 +++++++++++++++++++++++++++ + 2 files changed, 37 insertions(+), 5 deletions(-) + +diff --git a/regcomp.c b/regcomp.c +index 56b83f964a..58cb941b06 100644 +--- a/regcomp.c ++++ b/regcomp.c +@@ -14377,6 +14377,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { ++ PERL_UINT_FAST8_T backup_count = 0; + + const STRLEN full_len = len; + +@@ -14393,7 +14394,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) + goto loopdone; + } + +- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } ++ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { ++ backup_count++; ++ } + len = s - s0 + 1; + } + else { +@@ -14435,6 +14438,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); ++ backup_count++; + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, +@@ -14458,12 +14462,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) + } else { + + /* Here, the node does contain some characters that aren't +- * problematic. If one such is the final character in the +- * node, we are done */ +- if (len == full_len) { ++ * problematic. If we didn't have to backup any, then the ++ * final character in the node is non-problematic, and we ++ * can take the node as-is */ ++ if (backup_count == 0) { + goto loopdone; + } +- else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { ++ else if (backup_count == 1) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to +diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t +index d90ceeb5bd..ade8b1587a 100644 +--- a/t/re/pat_advanced.t ++++ b/t/re/pat_advanced.t +@@ -2176,6 +2176,33 @@ EOP + } + } + ok(! $failed, "Matched multi-char fold 'ss' across EXACTF node boundaries; if failed, was at count $failed"); ++ ++ for my $non_finals ("t", "ft", "ift", "sift") { ++ my $base_pat = $non_finals . "enKalt"; # (The tail is taken from ++ # the trouble ticket, is ++ # arbitrary) ++ for my $utf8 ("non-UTF-8", "UTF-8") { ++ ++ # Try at different lengths to be sure to get a node boundary ++ for my $repeat (120 .. 270) { # [perl #133756] ++ my $head = ("b" x $repeat) . "\xDC"; ++ my $pat = $base_pat; ++ utf8::upgrade($pat) if $utf8 eq "UTF-8"; ++ $pat = $head . $pat; ++ my $text = $head . $base_pat; ++ ++ if ($text !~ /$pat/i) { ++ $failed = $repeat; ++ last; ++ } ++ } ++ ++ ok(! $failed, "A non-final fold character " ++ . (length($non_finals) - 1) ++ . " characters from the end of an EXACTFish" ++ . " $utf8 pattern works; if failed, was at count $failed"); ++ } ++ } + } + + { +-- +2.17.2 + diff --git a/perl.spec b/perl.spec index e930b8a..1a0fa79 100644 --- a/perl.spec +++ b/perl.spec @@ -243,6 +243,10 @@ Patch48: perl-5.29.6-regen-warnings.pl-Fix-undefined-C-behavior.patch # in upstream after 5.29.6 Patch49: perl-5.29.6-perl-133575-prevent-set-longjmp-clobbering-locals-in.patch +# Fix a mismatch with a case-insesitive regular expression on a text with +# ligatures, RT#133756, in upstream after 5.29.6 +Patch50: perl-5.29.6-PATCH-perl-133756-Failure-to-match-properly.patch + # Link XS modules to libperl.so with EU::CBuilder on Linux, bug #960048 Patch200: perl-5.16.3-Link-XS-modules-to-libperl.so-with-EU-CBuilder-on-Li.patch @@ -2846,6 +2850,7 @@ Perl extension for Version Objects %patch47 -p1 %patch48 -p1 %patch49 -p1 +%patch50 -p1 %patch200 -p1 %patch201 -p1 @@ -2889,6 +2894,7 @@ perl -x patchlevel.h \ 'Fedora Patch47: Fix a crash when compiling a malformed form (RT#132158)' \ 'Fedora Patch48: Fix un undefined C behavior in NULL pointer arithmetics (RT#133223)' \ 'Fedora Patch49: Prevent long jumps from clobbering local variables (RT#133575)' \ + 'Fedora Patch50: Fix a mismatch with a case-insesitive regular expression on a text with ligatures (RT#133756)' \ 'Fedora Patch200: Link XS modules to libperl.so with EU::CBuilder on Linux' \ 'Fedora Patch201: Link XS modules to libperl.so with EU::MM on Linux' \ %{nil} @@ -5185,6 +5191,8 @@ popd - Fix a crash when compiling a malformed form (RT#132158) - Fix un undefined C behavior in NULL pointer arithmetics (RT#133223) - Prevent long jumps from clobbering local variables (RT#133575) +- Fix a mismatch with a case-insesitive regular expression on a text with ligatures + (RT#133756) * Mon Jan 14 2019 Björn Esser - 4:5.28.1-429 - Rebuilt for libcrypt.so.2 (#1666033)