pcre2/pcre2-10.30-Fix-PCRE2_FIRSTLINE-bug-when-a-pattern-match-starts-.patch

207 lines
7.7 KiB
Diff
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 1 Jan 2018 14:12:35 +0000
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
first code unit of a newline sequence.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2_dfa_match.c | 21 +++++++++++++++------
src/pcre2_match.c | 19 ++++++++++++++-----
testdata/testinput2 | 10 ++++++++++
testdata/testinput6 | 10 ++++++++++
testdata/testoutput2 | 13 +++++++++++++
testdata/testoutput6 | 13 +++++++++++++
6 files changed, 75 insertions(+), 11 deletions(-)
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 5ae1394..7bbd6d3 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2017 University of Cambridge
+ New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -3558,9 +3558,11 @@ for (;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -3568,7 +3570,7 @@ for (;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -3576,7 +3578,14 @@ for (;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 050b7e9..8afb0d8 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6541,9 +6541,11 @@ for(;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -6551,7 +6553,7 @@ for(;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -6559,7 +6561,14 @@ for(;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/testdata/testinput2 b/testdata/testinput2
index 695f0a4..b173fe0 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5385,4 +5385,14 @@ a)"xI
ab
aaab
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index ce2e082..614c3a0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4932,4 +4932,14 @@
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 31ccfbe..c19c270 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
0: ab
1: a
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index b912944..2d321d5 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7753,4 +7753,17 @@ No match
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
Failed: error -47: match limit exceeded
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput6
--
2.13.6