pcre2/pcre2-10.30-Fix-PCRE2_FIRSTLINE-bug-when-a-pattern-match-starts-.patch

207 lines
7.7 KiB
Diff
Raw Normal View History

From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 1 Jan 2018 14:12:35 +0000
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
first code unit of a newline sequence.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2_dfa_match.c | 21 +++++++++++++++------
src/pcre2_match.c | 19 ++++++++++++++-----
testdata/testinput2 | 10 ++++++++++
testdata/testinput6 | 10 ++++++++++
testdata/testoutput2 | 13 +++++++++++++
testdata/testoutput6 | 13 +++++++++++++
6 files changed, 75 insertions(+), 11 deletions(-)
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 5ae1394..7bbd6d3 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2017 University of Cambridge
+ New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -3558,9 +3558,11 @@ for (;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -3568,7 +3570,7 @@ for (;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -3576,7 +3578,14 @@ for (;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 050b7e9..8afb0d8 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6541,9 +6541,11 @@ for(;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -6551,7 +6553,7 @@ for(;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -6559,7 +6561,14 @@ for(;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/testdata/testinput2 b/testdata/testinput2
index 695f0a4..b173fe0 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5385,4 +5385,14 @@ a)"xI
ab
aaab
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index ce2e082..614c3a0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4932,4 +4932,14 @@
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 31ccfbe..c19c270 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
0: ab
1: a
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index b912944..2d321d5 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7753,4 +7753,17 @@ No match
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
Failed: error -47: match limit exceeded
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput6
--
2.13.6