207 lines
7.7 KiB
Diff
|
From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
|
|||
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
|||
|
Date: Mon, 1 Jan 2018 14:12:35 +0000
|
|||
|
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
|
|||
|
first code unit of a newline sequence.
|
|||
|
MIME-Version: 1.0
|
|||
|
Content-Type: text/plain; charset=UTF-8
|
|||
|
Content-Transfer-Encoding: 8bit
|
|||
|
|
|||
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
|
|||
|
Petr Písař: Ported to 10.32
|
|||
|
|
|||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|||
|
---
|
|||
|
src/pcre2_dfa_match.c | 21 +++++++++++++++------
|
|||
|
src/pcre2_match.c | 19 ++++++++++++++-----
|
|||
|
testdata/testinput2 | 10 ++++++++++
|
|||
|
testdata/testinput6 | 10 ++++++++++
|
|||
|
testdata/testoutput2 | 13 +++++++++++++
|
|||
|
testdata/testoutput6 | 13 +++++++++++++
|
|||
|
6 files changed, 75 insertions(+), 11 deletions(-)
|
|||
|
|
|||
|
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
|
|||
|
index 5ae1394..7bbd6d3 100644
|
|||
|
--- a/src/pcre2_dfa_match.c
|
|||
|
+++ b/src/pcre2_dfa_match.c
|
|||
|
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
|
|||
|
Written by Philip Hazel
|
|||
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
|||
|
- New API code Copyright (c) 2016-2017 University of Cambridge
|
|||
|
+ New API code Copyright (c) 2016-2018 University of Cambridge
|
|||
|
|
|||
|
-----------------------------------------------------------------------------
|
|||
|
Redistribution and use in source and binary forms, with or without
|
|||
|
@@ -3558,9 +3558,11 @@ for (;;)
|
|||
|
|
|||
|
/* If firstline is TRUE, the start of the match is constrained to the first
|
|||
|
line of a multiline string. That is, the match must be before or at the
|
|||
|
- first newline. Implement this by temporarily adjusting end_subject so that
|
|||
|
- we stop the optimization scans for a first code unit at a newline. If the
|
|||
|
- match fails at the newline, later code breaks this loop. */
|
|||
|
+ first newline following the start of matching. Temporarily adjust
|
|||
|
+ end_subject so that we stop the optimization scans for a first code unit
|
|||
|
+ immediately after the first character of a newline (the first code unit can
|
|||
|
+ legitimately be a newline). If the match fails at the newline, later code
|
|||
|
+ breaks this loop. */
|
|||
|
|
|||
|
if (firstline)
|
|||
|
{
|
|||
|
@@ -3568,7 +3570,7 @@ for (;;)
|
|||
|
#ifdef SUPPORT_UNICODE
|
|||
|
if (utf)
|
|||
|
{
|
|||
|
- while (t < mb->end_subject && !IS_NEWLINE(t))
|
|||
|
+ while (t < end_subject && !IS_NEWLINE(t))
|
|||
|
{
|
|||
|
t++;
|
|||
|
ACROSSCHAR(t < end_subject, *t, t++);
|
|||
|
@@ -3576,7 +3578,14 @@ for (;;)
|
|||
|
}
|
|||
|
else
|
|||
|
#endif
|
|||
|
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
|||
|
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
|
|||
|
+
|
|||
|
+ /* Note that we only need to advance by one code unit if we found a
|
|||
|
+ newline. If the newline is CRLF, a first code unit of LF should not
|
|||
|
+ match, because it is not at or before the newline. Similarly, only the
|
|||
|
+ first code unit of a Unicode newline might be relevant. */
|
|||
|
+
|
|||
|
+ if (t < end_subject) t++;
|
|||
|
end_subject = t;
|
|||
|
}
|
|||
|
|
|||
|
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
|
|||
|
index 050b7e9..8afb0d8 100644
|
|||
|
--- a/src/pcre2_match.c
|
|||
|
+++ b/src/pcre2_match.c
|
|||
|
@@ -6541,9 +6541,11 @@ for(;;)
|
|||
|
|
|||
|
/* If firstline is TRUE, the start of the match is constrained to the first
|
|||
|
line of a multiline string. That is, the match must be before or at the
|
|||
|
- first newline. Implement this by temporarily adjusting end_subject so that
|
|||
|
- we stop the optimization scans for a first code unit at a newline. If the
|
|||
|
- match fails at the newline, later code breaks this loop. */
|
|||
|
+ first newline following the start of matching. Temporarily adjust
|
|||
|
+ end_subject so that we stop the optimization scans for a first code unit
|
|||
|
+ immediately after the first character of a newline (the first code unit can
|
|||
|
+ legitimately be a newline). If the match fails at the newline, later code
|
|||
|
+ breaks this loop. */
|
|||
|
|
|||
|
if (firstline)
|
|||
|
{
|
|||
|
@@ -6551,7 +6553,7 @@ for(;;)
|
|||
|
#ifdef SUPPORT_UNICODE
|
|||
|
if (utf)
|
|||
|
{
|
|||
|
- while (t < mb->end_subject && !IS_NEWLINE(t))
|
|||
|
+ while (t < end_subject && !IS_NEWLINE(t))
|
|||
|
{
|
|||
|
t++;
|
|||
|
ACROSSCHAR(t < end_subject, *t, t++);
|
|||
|
@@ -6559,7 +6561,14 @@ for(;;)
|
|||
|
}
|
|||
|
else
|
|||
|
#endif
|
|||
|
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
|||
|
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
|
|||
|
+
|
|||
|
+ /* Note that we only need to advance by one code unit if we found a
|
|||
|
+ newline. If the newline is CRLF, a first code unit of LF should not
|
|||
|
+ match, because it is not at or before the newline. Similarly, only the
|
|||
|
+ first code unit of a Unicode newline might be relevant. */
|
|||
|
+
|
|||
|
+ if (t < end_subject) t++;
|
|||
|
end_subject = t;
|
|||
|
}
|
|||
|
|
|||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
|||
|
index 695f0a4..b173fe0 100644
|
|||
|
--- a/testdata/testinput2
|
|||
|
+++ b/testdata/testinput2
|
|||
|
@@ -5385,4 +5385,14 @@ a)"xI
|
|||
|
ab
|
|||
|
aaab
|
|||
|
|
|||
|
+/\n/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+
|
|||
|
+/\nabc/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+
|
|||
|
+/\x{0a}abc/firstline,newline=crlf
|
|||
|
+\= Expect no match
|
|||
|
+ xyz\r\nabc
|
|||
|
+
|
|||
|
# End of testinput2
|
|||
|
diff --git a/testdata/testinput6 b/testdata/testinput6
|
|||
|
index ce2e082..614c3a0 100644
|
|||
|
--- a/testdata/testinput6
|
|||
|
+++ b/testdata/testinput6
|
|||
|
@@ -4932,4 +4932,14 @@
|
|||
|
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
|
|||
|
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
|||
|
|
|||
|
+/\n/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+
|
|||
|
+/\nabc/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+
|
|||
|
+/\x{0a}abc/firstline,newline=crlf
|
|||
|
+\= Expect no match
|
|||
|
+ xyz\r\nabc
|
|||
|
+
|
|||
|
# End of testinput6
|
|||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
|||
|
index 31ccfbe..c19c270 100644
|
|||
|
--- a/testdata/testoutput2
|
|||
|
+++ b/testdata/testoutput2
|
|||
|
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
|
|||
|
0: ab
|
|||
|
1: a
|
|||
|
|
|||
|
+/\n/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+ 0: \x0a
|
|||
|
+
|
|||
|
+/\nabc/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+ 0: \x0aabc
|
|||
|
+
|
|||
|
+/\x{0a}abc/firstline,newline=crlf
|
|||
|
+\= Expect no match
|
|||
|
+ xyz\r\nabc
|
|||
|
+No match
|
|||
|
+
|
|||
|
# End of testinput2
|
|||
|
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
|
|||
|
Error -62: bad serialized data
|
|||
|
diff --git a/testdata/testoutput6 b/testdata/testoutput6
|
|||
|
index b912944..2d321d5 100644
|
|||
|
--- a/testdata/testoutput6
|
|||
|
+++ b/testdata/testoutput6
|
|||
|
@@ -7753,4 +7753,17 @@ No match
|
|||
|
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
|||
|
Failed: error -47: match limit exceeded
|
|||
|
|
|||
|
+/\n/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+ 0: \x0a
|
|||
|
+
|
|||
|
+/\nabc/firstline
|
|||
|
+ xyz\nabc
|
|||
|
+ 0: \x0aabc
|
|||
|
+
|
|||
|
+/\x{0a}abc/firstline,newline=crlf
|
|||
|
+\= Expect no match
|
|||
|
+ xyz\r\nabc
|
|||
|
+No match
|
|||
|
+
|
|||
|
# End of testinput6
|
|||
|
--
|
|||
|
2.13.6
|
|||
|
|