207 lines
7.7 KiB
Diff
207 lines
7.7 KiB
Diff
From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
|
||
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||
Date: Mon, 1 Jan 2018 14:12:35 +0000
|
||
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
|
||
first code unit of a newline sequence.
|
||
MIME-Version: 1.0
|
||
Content-Type: text/plain; charset=UTF-8
|
||
Content-Transfer-Encoding: 8bit
|
||
|
||
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
|
||
Petr Písař: Ported to 10.32
|
||
|
||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||
---
|
||
src/pcre2_dfa_match.c | 21 +++++++++++++++------
|
||
src/pcre2_match.c | 19 ++++++++++++++-----
|
||
testdata/testinput2 | 10 ++++++++++
|
||
testdata/testinput6 | 10 ++++++++++
|
||
testdata/testoutput2 | 13 +++++++++++++
|
||
testdata/testoutput6 | 13 +++++++++++++
|
||
6 files changed, 75 insertions(+), 11 deletions(-)
|
||
|
||
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
|
||
index 5ae1394..7bbd6d3 100644
|
||
--- a/src/pcre2_dfa_match.c
|
||
+++ b/src/pcre2_dfa_match.c
|
||
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||
|
||
Written by Philip Hazel
|
||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||
- New API code Copyright (c) 2016-2017 University of Cambridge
|
||
+ New API code Copyright (c) 2016-2018 University of Cambridge
|
||
|
||
-----------------------------------------------------------------------------
|
||
Redistribution and use in source and binary forms, with or without
|
||
@@ -3558,9 +3558,11 @@ for (;;)
|
||
|
||
/* If firstline is TRUE, the start of the match is constrained to the first
|
||
line of a multiline string. That is, the match must be before or at the
|
||
- first newline. Implement this by temporarily adjusting end_subject so that
|
||
- we stop the optimization scans for a first code unit at a newline. If the
|
||
- match fails at the newline, later code breaks this loop. */
|
||
+ first newline following the start of matching. Temporarily adjust
|
||
+ end_subject so that we stop the optimization scans for a first code unit
|
||
+ immediately after the initial code unit of a newline (the first code unit can
|
||
+ legitimately be a newline). If the match fails at the newline, later code
|
||
+ breaks this loop. */
|
||
|
||
if (firstline)
|
||
{
|
||
@@ -3568,7 +3570,7 @@ for (;;)
|
||
#ifdef SUPPORT_UNICODE
|
||
if (utf)
|
||
{
|
||
- while (t < mb->end_subject && !IS_NEWLINE(t))
|
||
+ while (t < end_subject && !IS_NEWLINE(t))
|
||
{
|
||
t++;
|
||
ACROSSCHAR(t < end_subject, *t, t++);
|
||
@@ -3576,7 +3578,14 @@ for (;;)
|
||
}
|
||
else
|
||
#endif
|
||
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
||
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
|
||
+
|
||
+ /* Note that we only need to advance by one code unit if we found a
|
||
+ newline. If the newline is CRLF, a first code unit of LF should not
|
||
+ match, because it is not at or before the newline. Similarly, only the
|
||
+ first code unit of a Unicode newline might be relevant. */
|
||
+
|
||
+ if (t < end_subject) t++;
|
||
end_subject = t;
|
||
}
|
||
|
||
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
|
||
index 050b7e9..8afb0d8 100644
|
||
--- a/src/pcre2_match.c
|
||
+++ b/src/pcre2_match.c
|
||
@@ -6541,9 +6541,11 @@ for(;;)
|
||
|
||
/* If firstline is TRUE, the start of the match is constrained to the first
|
||
line of a multiline string. That is, the match must be before or at the
|
||
- first newline. Implement this by temporarily adjusting end_subject so that
|
||
- we stop the optimization scans for a first code unit at a newline. If the
|
||
- match fails at the newline, later code breaks this loop. */
|
||
+ first newline following the start of matching. Temporarily adjust
|
||
+ end_subject so that we stop the optimization scans for a first code unit
|
||
+ immediately after the initial code unit of a newline (the first code unit can
|
||
+ legitimately be a newline). If the match fails at the newline, later code
|
||
+ breaks this loop. */
|
||
|
||
if (firstline)
|
||
{
|
||
@@ -6551,7 +6553,7 @@ for(;;)
|
||
#ifdef SUPPORT_UNICODE
|
||
if (utf)
|
||
{
|
||
- while (t < mb->end_subject && !IS_NEWLINE(t))
|
||
+ while (t < end_subject && !IS_NEWLINE(t))
|
||
{
|
||
t++;
|
||
ACROSSCHAR(t < end_subject, *t, t++);
|
||
@@ -6559,7 +6561,14 @@ for(;;)
|
||
}
|
||
else
|
||
#endif
|
||
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
||
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
|
||
+
|
||
+ /* Note that we only need to advance by one code unit if we found a
|
||
+ newline. If the newline is CRLF, a first code unit of LF should not
|
||
+ match, because it is not at or before the newline. Similarly, only the
|
||
+ first code unit of a Unicode newline might be relevant. */
|
||
+
|
||
+ if (t < end_subject) t++;
|
||
end_subject = t;
|
||
}
|
||
|
||
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||
index 695f0a4..b173fe0 100644
|
||
--- a/testdata/testinput2
|
||
+++ b/testdata/testinput2
|
||
@@ -5385,4 +5385,14 @@ a)"xI
|
||
ab
|
||
aaab
|
||
|
||
+/\n/firstline
|
||
+ xyz\nabc
|
||
+
|
||
+/\nabc/firstline
|
||
+ xyz\nabc
|
||
+
|
||
+/\x{0a}abc/firstline,newline=crlf
|
||
+\= Expect no match
|
||
+ xyz\r\nabc
|
||
+
|
||
# End of testinput2
|
||
diff --git a/testdata/testinput6 b/testdata/testinput6
|
||
index ce2e082..614c3a0 100644
|
||
--- a/testdata/testinput6
|
||
+++ b/testdata/testinput6
|
||
@@ -4932,4 +4932,14 @@
|
||
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
|
||
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
||
|
||
+/\n/firstline
|
||
+ xyz\nabc
|
||
+
|
||
+/\nabc/firstline
|
||
+ xyz\nabc
|
||
+
|
||
+/\x{0a}abc/firstline,newline=crlf
|
||
+\= Expect no match
|
||
+ xyz\r\nabc
|
||
+
|
||
# End of testinput6
|
||
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||
index 31ccfbe..c19c270 100644
|
||
--- a/testdata/testoutput2
|
||
+++ b/testdata/testoutput2
|
||
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
|
||
0: ab
|
||
1: a
|
||
|
||
+/\n/firstline
|
||
+ xyz\nabc
|
||
+ 0: \x0a
|
||
+
|
||
+/\nabc/firstline
|
||
+ xyz\nabc
|
||
+ 0: \x0aabc
|
||
+
|
||
+/\x{0a}abc/firstline,newline=crlf
|
||
+\= Expect no match
|
||
+ xyz\r\nabc
|
||
+No match
|
||
+
|
||
# End of testinput2
|
||
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
|
||
Error -62: bad serialized data
|
||
diff --git a/testdata/testoutput6 b/testdata/testoutput6
|
||
index b912944..2d321d5 100644
|
||
--- a/testdata/testoutput6
|
||
+++ b/testdata/testoutput6
|
||
@@ -7753,4 +7753,17 @@ No match
|
||
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
||
Failed: error -47: match limit exceeded
|
||
|
||
+/\n/firstline
|
||
+ xyz\nabc
|
||
+ 0: \x0a
|
||
+
|
||
+/\nabc/firstline
|
||
+ xyz\nabc
|
||
+ 0: \x0aabc
|
||
+
|
||
+/\x{0a}abc/firstline,newline=crlf
|
||
+\= Expect no match
|
||
+ xyz\r\nabc
|
||
+No match
|
||
+
|
||
# End of testinput6
|
||
--
|
||
2.13.6
|
||
|