Fix optimization bugs for patterns starting with lookaheads

This commit is contained in:
Petr Písař 2016-10-19 13:38:16 +02:00
parent 0985ca53fb
commit 14e4e93702
2 changed files with 286 additions and 0 deletions

View File

@ -0,0 +1,280 @@
From 95d88c4044fb74af8c3259db6413214fe9a106e2 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Tue, 18 Oct 2016 11:22:40 +0000
Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.22.
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2_compile.c | 63 ++++++++++++++++++++++++++++++----------------------
testdata/testinput1 | 6 +++++
testdata/testinput2 | 2 ++
testdata/testinput4 | 6 +++++
testdata/testoutput1 | 8 +++++++
testdata/testoutput2 | 6 ++++-
testdata/testoutput4 | 8 +++++++
7 files changed, 72 insertions(+), 27 deletions(-)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index b3ebb25..fe37310 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -5553,8 +5553,9 @@ for (;; ptr++)
*lengthptr += delta;
}
- /* This is compiling for real. If there is a set first byte for
- the group, and we have not yet set a "required byte", set it. */
+ /* This is compiling for real. If there is a set first code unit
+ for the group, and we have not yet set a "required code unit", set
+ it. */
else
{
@@ -7185,7 +7186,7 @@ for (;; ptr++)
zerofirstcuflags = firstcuflags;
groupsetfirstcu = FALSE;
- if (bravalue >= OP_ONCE)
+ if (bravalue >= OP_ONCE) /* Not an assertion */
{
/* If we have not yet set a firstcu in this branch, take it from the
subpattern, remembering that it was set here so that a repeat of more
@@ -7225,15 +7226,19 @@ for (;; ptr++)
}
}
- /* For a forward assertion, we take the reqcu, if set. This can be
- helpful if the pattern that follows the assertion doesn't set a different
- char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
- for an assertion, however because it leads to incorrect effect for patterns
- such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
- of a firstcu. This is overcome by a scan at the end if there's no
- firstcu, looking for an asserted first char. */
-
- else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
+ /* For a forward assertion, we take the reqcu, if set, provided that the
+ group has also set a firstcu. This can be helpful if the pattern that
+ follows the assertion doesn't set a different char. For example, it's
+ useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
+ because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
+ the "real" "a" would then become a reqcu instead of a firstcu. This is
+ overcome by a scan at the end if there's no firstcu, looking for an
+ asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
+ we must only take the reqcu when the group also set a firstcu. Otherwise,
+ in that example, 'X' ends up set for both. */
+
+ else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
+ subfirstcuflags >= 0)
{
reqcu = subreqcu;
reqcuflags = subreqcuflags;
@@ -8036,8 +8041,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
the beginning or after \n). As in the case of is_anchored() (see above), we
have to take account of back references to capturing brackets that contain .*
because in that case we can't make the assumption. Also, the appearance of .*
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
-count, because once again the assumption no longer holds.
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
Arguments:
code points to start of the compiled pattern or a group
@@ -8046,13 +8051,14 @@ Arguments:
the less precise approach
cb points to the compile data
atomcount atomic group level
+ inassert TRUE if in an assertion
Returns: TRUE or FALSE
*/
static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
- int atomcount)
+ int atomcount, BOOL inassert)
{
do {
PCRE2_SPTR scode = first_significant_code(
@@ -8083,7 +8089,7 @@ do {
return FALSE;
default: /* Assertion */
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE;
break;
@@ -8097,7 +8103,8 @@ do {
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
+ return FALSE;
}
/* Capturing brackets */
@@ -8107,33 +8114,36 @@ do {
{
int n = GET2(scode, 1+LINK_SIZE);
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
- if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
}
/* Positive forward assertions */
else if (op == OP_ASSERT)
{
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
+ return FALSE;
}
/* Atomic brackets */
else if (op == OP_ONCE || op == OP_ONCE_NC)
{
- if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
+ if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
+ return FALSE;
}
/* .* means "start at start or after \n" if it isn't in atomic brackets or
- brackets that may be referenced, as long as the pattern does not contain
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
- start of a line. There is also an option that disables this optimization. */
+ brackets that may be referenced or an assertion, and as long as the pattern
+ does not contain *PRUNE or *SKIP, because these break the feature. Consider,
+ for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
+ i.e. not at the start of a line. There is also an option that disables this
+ optimization. */
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
- atomcount > 0 || cb->had_pruneorskip ||
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
return FALSE;
}
@@ -9023,7 +9033,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
when *PRUNE and SKIP are not present. (There is an option that disables this
case.) */
- else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
+ else if (is_startline(codestart, 0, &cb, 0, FALSE))
+ re->flags |= PCRE2_STARTLINE;
}
/* Handle the "required code unit", if one is set. In the case of an anchored
diff --git a/testdata/testinput1 b/testdata/testinput1
index 6d7bc80..0d680d3 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5792,4 +5792,10 @@ name)/mark
aaaccccaaa
bccccb
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+
+/(?=.*X)X$/
+ \ X
+
# End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index 2d4bbe5..94fbfab 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4823,4 +4823,6 @@ a)"xI
/(?<R>abc)(?(R)xyz)/B
+/(?=.*[A-Z])/I
+
# End of testinput2
diff --git a/testdata/testinput4 b/testdata/testinput4
index ce9145d..73582b7 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -2282,4 +2282,10 @@
\x{389}
\x{20ac}
+/(?=.*b)\pL/
+ 11bb
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+
# End of testinput4
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index d28bf91..02e07bf 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9257,4 +9257,12 @@ No match
1: b
2: cccc
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+ \ Fred:099
+ 0:
+
+/(?=.*X)X$/
+ \ X
+ 0: X
+
# End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index ef7e329..bdb289d 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -8751,7 +8751,6 @@ Subject length lower bound = 1
/(?(?=.*b).*b|^d)/I
Capturing subpattern count = 0
-First code unit at start or follows newline
Subject length lower bound = 1
/xyz/auto_callout
@@ -15223,6 +15222,11 @@ No match
End
------------------------------------------------------------------
+/(?=.*[A-Z])/I
+Capturing subpattern count = 0
+May match empty string
+Subject length lower bound = 0
+
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 701d411..d2d5e51 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -3703,4 +3703,12 @@ No match
\x{20ac}
No match
+/(?=.*b)\pL/
+ 11bb
+ 0: b
+
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+ 11bb
+ 0: b
+
# End of testinput4
--
2.7.4

View File

@ -43,6 +43,9 @@ Patch6: pcre2-10.22-Add-test-for-bug-already-fixed-by-the-refactoring.patch
# Fix internal option documentation in pcre2pattern(3), in upstream after 10.22,
# upstream bug #1875
Patch7: pcre2-10.22-Fix-documentation-error.patch
# Fix optimization bugs for patterns starting with lookaheads,
# in upstream after 10.22, upstream bug #1882
Patch8: pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch
# New libtool to get rid of RPATH and to use distribution autotools
BuildRequires: autoconf
BuildRequires: automake
@ -127,6 +130,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
# Because of multilib patch
libtoolize --copy --force
autoreconf -vif
@ -230,6 +234,8 @@ make %{?_smp_mflags} check VERBOSE=yes
- Fix miscompilation of conditionals when a group name starts with "R"
(upstream bug #1873)
- Fix internal option documentation in pcre2pattern(3) (upstream bug #1875)
- Fix optimization bugs for patterns starting with lookaheads
(upstream bug #1882)
* Mon Aug 29 2016 Petr Pisar <ppisar@redhat.com> - 10.22-2
- Fix matching characters above 255 when a negative character type was used