Fix matching at the first code unit of a newline sequence if PCRE2_FIRSTLINE is enabled

This commit is contained in:
Petr Písař 2018-01-12 12:46:29 +01:00
parent 6d626f9a4d
commit 1c9da09ce5
4 changed files with 499 additions and 0 deletions

View File

@ -0,0 +1,61 @@
From 3fdced6eef96f50ac5bd287426db0aa699be3edc Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sun, 31 Dec 2017 17:44:12 +0000
Subject: [PATCH] Documentation update.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@898 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
doc/pcre2api.3 | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index f80ae58..d55debf 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -871,10 +871,11 @@ compiled. If a match is started with a non-default match limit when
PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
.P
The offset limit facility can be used to track progress when searching large
-subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
-start within the first line of the subject. If this is set with an offset
-limit, a match must occur in the first line and also within the offset limit.
-In other words, whichever limit comes first is used.
+subject strings. See also the PCRE2_FIRSTLINE option, which requires a match
+to start before or at the first newline that follows the start of matching in
+the subject. If this is set with an offset limit, a match must occur in the
+first line and also within the offset limit. In other words, whichever limit
+comes first is used.
.sp
.nf
.B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP,
@@ -1423,11 +1424,16 @@ changed within a pattern by a (?xx) option setting.
PCRE2_FIRSTLINE
.sp
If this option is set, the start of an unanchored pattern match must be before
-or at the first newline in the subject string, though the matched text may
-continue over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a
-more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit,
-a match must occur in the first line and also within the offset limit. In other
-words, whichever limit comes first is used.
+or at the first newline in the subject string following the start of matching,
+though the matched text may continue over the newline. If \fIstartoffset\fP is
+non-zero, the limiting newline is not necessarily the first newline in the
+subject. For example, if the subject string is "abc\enxyz" (where \en
+represents a single-character newline) a pattern match for "yz" succeeds with
+PCRE2_FIRSTLINE if \fIstartoffset\fP is greater than 3. See also
+PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If
+PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first
+line and also within the offset limit. In other words, whichever limit comes
+first is used.
.sp
PCRE2_LITERAL
.sp
--
2.13.6

View File

@ -0,0 +1,206 @@
From f3b22988611cca57770a705f05c0d9ef583d605a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 1 Jan 2018 14:12:35 +0000
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
first code unit of a newline sequence.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@899 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2_dfa_match.c | 21 +++++++++++++++------
src/pcre2_match.c | 19 ++++++++++++++-----
testdata/testinput2 | 10 ++++++++++
testdata/testinput6 | 10 ++++++++++
testdata/testoutput2 | 13 +++++++++++++
testdata/testoutput6 | 13 +++++++++++++
6 files changed, 75 insertions(+), 11 deletions(-)
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 5ae1394..7bbd6d3 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2017 University of Cambridge
+ New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -3558,9 +3558,11 @@ for (;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -3568,7 +3570,7 @@ for (;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -3576,7 +3578,14 @@ for (;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 050b7e9..8afb0d8 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6541,9 +6541,11 @@ for(;;)
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
- first newline. Implement this by temporarily adjusting end_subject so that
- we stop the optimization scans for a first code unit at a newline. If the
- match fails at the newline, later code breaks this loop. */
+ first newline following the start of matching. Temporarily adjust
+ end_subject so that we stop the optimization scans for a first code unit
+ immediately after the first character of a newline (the first code unit can
+ legitimately be a newline). If the match fails at the newline, later code
+ breaks this loop. */
if (firstline)
{
@@ -6551,7 +6553,7 @@ for(;;)
#ifdef SUPPORT_UNICODE
if (utf)
{
- while (t < mb->end_subject && !IS_NEWLINE(t))
+ while (t < end_subject && !IS_NEWLINE(t))
{
t++;
ACROSSCHAR(t < end_subject, *t, t++);
@@ -6559,7 +6561,14 @@ for(;;)
}
else
#endif
- while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+ while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+ /* Note that we only need to advance by one code unit if we found a
+ newline. If the newline is CRLF, a first code unit of LF should not
+ match, because it is not at or before the newline. Similarly, only the
+ first code unit of a Unicode newline might be relevant. */
+
+ if (t < end_subject) t++;
end_subject = t;
}
diff --git a/testdata/testinput2 b/testdata/testinput2
index 695f0a4..b173fe0 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5385,4 +5385,14 @@ a)"xI
ab
aaab
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index ce2e082..614c3a0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4932,4 +4932,14 @@
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
+/\n/firstline
+ xyz\nabc
+
+/\nabc/firstline
+ xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+
# End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 31ccfbe..c19c270 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16368,6 +16368,19 @@ Subject length lower bound = 1
0: ab
1: a
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index b912944..2d321d5 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7753,4 +7753,17 @@ No match
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
Failed: error -47: match limit exceeded
+/\n/firstline
+ xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+ xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+ xyz\r\nabc
+No match
+
# End of testinput6
--
2.13.6

View File

@ -0,0 +1,218 @@
From 1b5d77c6edc5ee8e8fe5c96bf9cad5798d6ce36c Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 1 Jan 2018 14:54:06 +0000
Subject: [PATCH 3/3] Previous FIRSTLINE patch was broken. Fix it.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@900 6239d852-aaf2-0410-a92c-79f79f948069
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2_dfa_match.c | 27 +++++++++++----------------
src/pcre2_match.c | 37 +++++++++++++++----------------------
testdata/testinput2 | 4 ++++
testdata/testinput6 | 4 ++++
testdata/testoutput2 | 5 +++++
testdata/testoutput6 | 5 +++++
6 files changed, 44 insertions(+), 38 deletions(-)
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 9c1d805..65243bf 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -3363,8 +3363,6 @@ for (;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
(options & PCRE2_DFA_RESTART) == 0)
{
- PCRE2_SPTR save_end_subject = end_subject;
-
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
first newline following the start of matching. Temporarily adjust
@@ -3388,13 +3386,6 @@ for (;;)
else
#endif
while (t < end_subject && !IS_NEWLINE(t)) t++;
-
- /* Note that we only need to advance by one code unit if we found a
- newline. If the newline is CRLF, a first code unit of LF should not
- match, because it is not at or before the newline. Similarly, only the
- first code unit of a Unicode newline might be relevant. */
-
- if (t < end_subject) t++;
end_subject = t;
}
@@ -3466,14 +3457,18 @@ for (;;)
#endif
}
- /* If we can't find the required code unit, break the bumpalong loop,
- to force a match failure, except when doing partial matching, when we
- let the next cycle run at the end of the subject. To see why, consider
- the pattern /(?<=abc)def/, which partially matches "abc", even though
- the string does not contain the starting character "d". */
+ /* If we can't find the required code unit, having reached the true end
+ of the subject, break the bumpalong loop, to force a match failure,
+ except when doing partial matching, when we let the next cycle run at
+ the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+ which partially matches "abc", even though the string does not contain
+ the starting character "d". If we have not reached the true end of the
+ subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+ we also let the cycle run, because the matching string is legitimately
+ allowed to start with the first code unit of a newline. */
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
- start_match >= end_subject)
+ start_match >= mb->end_subject)
break;
}
@@ -3532,7 +3527,7 @@ for (;;)
/* Restore fudged end_subject */
- end_subject = save_end_subject;
+ end_subject = mb->end_subject;
/* The following two optimizations are disabled for partial matching. */
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 8872345..c6b6975 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2015-2017 University of Cambridge
+ New API code Copyright (c) 2015-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -6363,15 +6363,11 @@ for(;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
{
- PCRE2_SPTR save_end_subject = end_subject;
-
/* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the
first newline following the start of matching. Temporarily adjust
- end_subject so that we stop the optimization scans for a first code unit
- immediately after the first character of a newline (the first code unit can
- legitimately be a newline). If the match fails at the newline, later code
- breaks this loop. */
+ end_subject so that we stop the scans for a first code unit at a newline.
+ If the match fails at the newline, later code breaks the loop. */
if (firstline)
{
@@ -6388,13 +6384,6 @@ for(;;)
else
#endif
while (t < end_subject && !IS_NEWLINE(t)) t++;
-
- /* Note that we only need to advance by one code unit if we found a
- newline. If the newline is CRLF, a first code unit of LF should not
- match, because it is not at or before the newline. Similarly, only the
- first code unit of a Unicode newline might be relevant. */
-
- if (t < end_subject) t++;
end_subject = t;
}
@@ -6470,13 +6459,17 @@ for(;;)
#endif
}
- /* If we can't find the required code unit, break the bumpalong loop,
- to force a match failure, except when doing partial matching, when we
- let the next cycle run at the end of the subject. To see why, consider
- the pattern /(?<=abc)def/, which partially matches "abc", even though
- the string does not contain the starting character "d". */
-
- if (!mb->partial && start_match >= end_subject)
+ /* If we can't find the required code unit, having reached the true end
+ of the subject, break the bumpalong loop, to force a match failure,
+ except when doing partial matching, when we let the next cycle run at
+ the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+ which partially matches "abc", even though the string does not contain
+ the starting character "d". If we have not reached the true end of the
+ subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+ we also let the cycle run, because the matching string is legitimately
+ allowed to start with the first code unit of a newline. */
+
+ if (!mb->partial && start_match >= mb->end_subject)
{
rc = MATCH_NOMATCH;
break;
@@ -6538,7 +6531,7 @@ for(;;)
/* Restore fudged end_subject */
- end_subject = save_end_subject;
+ end_subject = mb->end_subject;
/* The following two optimizations must be disabled for partial matching. */
diff --git a/testdata/testinput2 b/testdata/testinput2
index fe8efbf..36e4454 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5405,4 +5405,8 @@ a)"xI
\= Expect no match
xyz\r\nabc
+/[abc]/firstline
+\= Expect no match
+ \na
+
# End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index 614c3a0..e2f00c0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4942,4 +4942,8 @@
\= Expect no match
xyz\r\nabc
+/[abc]/firstline
+\= Expect no match
+ \na
+
# End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 62ec12f..f146c0c 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16453,6 +16453,11 @@ No match
xyz\r\nabc
No match
+/[abc]/firstline
+\= Expect no match
+ \na
+No match
+
# End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index 998f20b..b409fe0 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7766,4 +7766,9 @@ Failed: error -47: match limit exceeded
xyz\r\nabc
No match
+/[abc]/firstline
+\= Expect no match
+ \na
+No match
+
# End of testinput6
--
2.13.6

View File

@ -71,6 +71,15 @@ Patch9: pcre2-10.30-Documentation-update.patch
# Fix handling \K in an assertion in pcre2grep tool, upstream bug #2211,
# in upstream after 10.30
Patch10: pcre2-10.30-Fix-K-issues-in-pcre2grep.patch
# 1/3 Fix matching at a first code unit of a new line sequence if
# PCRE2_FIRSTLINE is enabled, in upstream after 10.30
Patch11: pcre2-10.30-FIRSTLINE_documentation-update.patch
# 2/3 Fix matching at a first code unit of a new line sequence if
# PCRE2_FIRSTLINE is enabled, in upstream after 10.30
Patch12: pcre2-10.30-Fix-PCRE2_FIRSTLINE-bug-when-a-pattern-match-starts-.patch
# 3/3 Fix matching at a first code unit of a new line sequence if
# PCRE2_FIRSTLINE is enabled, in upstream after 10.30
Patch13: pcre2-10.30-Previous-FIRSTLINE-patch-was-broken.-Fix-it.patch
BuildRequires: autoconf
BuildRequires: automake
BuildRequires: coreutils
@ -155,6 +164,9 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
%patch8 -p1
%patch9 -p1
%patch10 -p1
%patch11 -p1
%patch12 -p1
%patch13 -p1
# Because of multilib patch
libtoolize --copy --force
autoreconf -vif
@ -261,6 +273,8 @@ make %{?_smp_mflags} check VERBOSE=yes
* Fri Jan 12 2018 Petr Pisar <ppisar@redhat.com> - 10.30-5
- Fix handling \K in an assertion in pcre2grep tool and documentation
(upstream bug #2211)
- Fix matching at a first code unit of a new line sequence if PCRE2_FIRSTLINE
is enabled
* Fri Dec 22 2017 Petr Pisar <ppisar@redhat.com> - 10.30-4
- Fix pcre2_jit_match() to properly check the pattern was JIT-compiled