Fix optimization bugs for patterns starting with lookaheads
This commit is contained in:
parent
0985ca53fb
commit
14e4e93702
@ -0,0 +1,280 @@
|
||||
From 95d88c4044fb74af8c3259db6413214fe9a106e2 Mon Sep 17 00:00:00 2001
|
||||
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||||
Date: Tue, 18 Oct 2016 11:22:40 +0000
|
||||
Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead.
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069
|
||||
|
||||
Petr Písař: Ported to 10.22.
|
||||
|
||||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||
---
|
||||
src/pcre2_compile.c | 63 ++++++++++++++++++++++++++++++----------------------
|
||||
testdata/testinput1 | 6 +++++
|
||||
testdata/testinput2 | 2 ++
|
||||
testdata/testinput4 | 6 +++++
|
||||
testdata/testoutput1 | 8 +++++++
|
||||
testdata/testoutput2 | 6 ++++-
|
||||
testdata/testoutput4 | 8 +++++++
|
||||
7 files changed, 72 insertions(+), 27 deletions(-)
|
||||
|
||||
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||
index b3ebb25..fe37310 100644
|
||||
--- a/src/pcre2_compile.c
|
||||
+++ b/src/pcre2_compile.c
|
||||
@@ -5553,8 +5553,9 @@ for (;; ptr++)
|
||||
*lengthptr += delta;
|
||||
}
|
||||
|
||||
- /* This is compiling for real. If there is a set first byte for
|
||||
- the group, and we have not yet set a "required byte", set it. */
|
||||
+ /* This is compiling for real. If there is a set first code unit
|
||||
+ for the group, and we have not yet set a "required code unit", set
|
||||
+ it. */
|
||||
|
||||
else
|
||||
{
|
||||
@@ -7185,7 +7186,7 @@ for (;; ptr++)
|
||||
zerofirstcuflags = firstcuflags;
|
||||
groupsetfirstcu = FALSE;
|
||||
|
||||
- if (bravalue >= OP_ONCE)
|
||||
+ if (bravalue >= OP_ONCE) /* Not an assertion */
|
||||
{
|
||||
/* If we have not yet set a firstcu in this branch, take it from the
|
||||
subpattern, remembering that it was set here so that a repeat of more
|
||||
@@ -7225,15 +7226,19 @@ for (;; ptr++)
|
||||
}
|
||||
}
|
||||
|
||||
- /* For a forward assertion, we take the reqcu, if set. This can be
|
||||
- helpful if the pattern that follows the assertion doesn't set a different
|
||||
- char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
|
||||
- for an assertion, however because it leads to incorrect effect for patterns
|
||||
- such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
|
||||
- of a firstcu. This is overcome by a scan at the end if there's no
|
||||
- firstcu, looking for an asserted first char. */
|
||||
-
|
||||
- else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
|
||||
+ /* For a forward assertion, we take the reqcu, if set, provided that the
|
||||
+ group has also set a firstcu. This can be helpful if the pattern that
|
||||
+ follows the assertion doesn't set a different char. For example, it's
|
||||
+ useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
|
||||
+ because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
|
||||
+ the "real" "a" would then become a reqcu instead of a firstcu. This is
|
||||
+ overcome by a scan at the end if there's no firstcu, looking for an
|
||||
+ asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
|
||||
+ we must only take the reqcu when the group also set a firstcu. Otherwise,
|
||||
+ in that example, 'X' ends up set for both. */
|
||||
+
|
||||
+ else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
|
||||
+ subfirstcuflags >= 0)
|
||||
{
|
||||
reqcu = subreqcu;
|
||||
reqcuflags = subreqcuflags;
|
||||
@@ -8036,8 +8041,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
|
||||
the beginning or after \n). As in the case of is_anchored() (see above), we
|
||||
have to take account of back references to capturing brackets that contain .*
|
||||
because in that case we can't make the assumption. Also, the appearance of .*
|
||||
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
|
||||
-count, because once again the assumption no longer holds.
|
||||
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
|
||||
+or *SKIP does not count, because once again the assumption no longer holds.
|
||||
|
||||
Arguments:
|
||||
code points to start of the compiled pattern or a group
|
||||
@@ -8046,13 +8051,14 @@ Arguments:
|
||||
the less precise approach
|
||||
cb points to the compile data
|
||||
atomcount atomic group level
|
||||
+ inassert TRUE if in an assertion
|
||||
|
||||
Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
static BOOL
|
||||
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
||||
- int atomcount)
|
||||
+ int atomcount, BOOL inassert)
|
||||
{
|
||||
do {
|
||||
PCRE2_SPTR scode = first_significant_code(
|
||||
@@ -8083,7 +8089,7 @@ do {
|
||||
return FALSE;
|
||||
|
||||
default: /* Assertion */
|
||||
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||||
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
|
||||
do scode += GET(scode, 1); while (*scode == OP_ALT);
|
||||
scode += 1 + LINK_SIZE;
|
||||
break;
|
||||
@@ -8097,7 +8103,8 @@ do {
|
||||
if (op == OP_BRA || op == OP_BRAPOS ||
|
||||
op == OP_SBRA || op == OP_SBRAPOS)
|
||||
{
|
||||
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||||
+ if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
|
||||
+ return FALSE;
|
||||
}
|
||||
|
||||
/* Capturing brackets */
|
||||
@@ -8107,33 +8114,36 @@ do {
|
||||
{
|
||||
int n = GET2(scode, 1+LINK_SIZE);
|
||||
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||||
- if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
|
||||
+ if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
|
||||
}
|
||||
|
||||
/* Positive forward assertions */
|
||||
|
||||
else if (op == OP_ASSERT)
|
||||
{
|
||||
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||||
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
|
||||
+ return FALSE;
|
||||
}
|
||||
|
||||
/* Atomic brackets */
|
||||
|
||||
else if (op == OP_ONCE || op == OP_ONCE_NC)
|
||||
{
|
||||
- if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
|
||||
+ if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
|
||||
+ return FALSE;
|
||||
}
|
||||
|
||||
/* .* means "start at start or after \n" if it isn't in atomic brackets or
|
||||
- brackets that may be referenced, as long as the pattern does not contain
|
||||
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
|
||||
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
|
||||
- start of a line. There is also an option that disables this optimization. */
|
||||
+ brackets that may be referenced or an assertion, and as long as the pattern
|
||||
+ does not contain *PRUNE or *SKIP, because these break the feature. Consider,
|
||||
+ for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
|
||||
+ i.e. not at the start of a line. There is also an option that disables this
|
||||
+ optimization. */
|
||||
|
||||
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
||||
{
|
||||
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
|
||||
- atomcount > 0 || cb->had_pruneorskip ||
|
||||
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
|
||||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
|
||||
return FALSE;
|
||||
}
|
||||
@@ -9023,7 +9033,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
|
||||
when *PRUNE and SKIP are not present. (There is an option that disables this
|
||||
case.) */
|
||||
|
||||
- else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
|
||||
+ else if (is_startline(codestart, 0, &cb, 0, FALSE))
|
||||
+ re->flags |= PCRE2_STARTLINE;
|
||||
}
|
||||
|
||||
/* Handle the "required code unit", if one is set. In the case of an anchored
|
||||
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||||
index 6d7bc80..0d680d3 100644
|
||||
--- a/testdata/testinput1
|
||||
+++ b/testdata/testinput1
|
||||
@@ -5792,4 +5792,10 @@ name)/mark
|
||||
aaaccccaaa
|
||||
bccccb
|
||||
|
||||
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||
+ \ Fred:099
|
||||
+
|
||||
+/(?=.*X)X$/
|
||||
+ \ X
|
||||
+
|
||||
# End of testinput1
|
||||
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||
index 2d4bbe5..94fbfab 100644
|
||||
--- a/testdata/testinput2
|
||||
+++ b/testdata/testinput2
|
||||
@@ -4823,4 +4823,6 @@ a)"xI
|
||||
|
||||
/(?<R>abc)(?(R)xyz)/B
|
||||
|
||||
+/(?=.*[A-Z])/I
|
||||
+
|
||||
# End of testinput2
|
||||
diff --git a/testdata/testinput4 b/testdata/testinput4
|
||||
index ce9145d..73582b7 100644
|
||||
--- a/testdata/testinput4
|
||||
+++ b/testdata/testinput4
|
||||
@@ -2282,4 +2282,10 @@
|
||||
\x{389}
|
||||
\x{20ac}
|
||||
|
||||
+/(?=.*b)\pL/
|
||||
+ 11bb
|
||||
+
|
||||
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||
+ 11bb
|
||||
+
|
||||
# End of testinput4
|
||||
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||||
index d28bf91..02e07bf 100644
|
||||
--- a/testdata/testoutput1
|
||||
+++ b/testdata/testoutput1
|
||||
@@ -9257,4 +9257,12 @@ No match
|
||||
1: b
|
||||
2: cccc
|
||||
|
||||
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||
+ \ Fred:099
|
||||
+ 0:
|
||||
+
|
||||
+/(?=.*X)X$/
|
||||
+ \ X
|
||||
+ 0: X
|
||||
+
|
||||
# End of testinput1
|
||||
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||
index ef7e329..bdb289d 100644
|
||||
--- a/testdata/testoutput2
|
||||
+++ b/testdata/testoutput2
|
||||
@@ -8751,7 +8751,6 @@ Subject length lower bound = 1
|
||||
|
||||
/(?(?=.*b).*b|^d)/I
|
||||
Capturing subpattern count = 0
|
||||
-First code unit at start or follows newline
|
||||
Subject length lower bound = 1
|
||||
|
||||
/xyz/auto_callout
|
||||
@@ -15223,6 +15222,11 @@ No match
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
+/(?=.*[A-Z])/I
|
||||
+Capturing subpattern count = 0
|
||||
+May match empty string
|
||||
+Subject length lower bound = 0
|
||||
+
|
||||
# End of testinput2
|
||||
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error -62: bad serialized data
|
||||
diff --git a/testdata/testoutput4 b/testdata/testoutput4
|
||||
index 701d411..d2d5e51 100644
|
||||
--- a/testdata/testoutput4
|
||||
+++ b/testdata/testoutput4
|
||||
@@ -3703,4 +3703,12 @@ No match
|
||||
\x{20ac}
|
||||
No match
|
||||
|
||||
+/(?=.*b)\pL/
|
||||
+ 11bb
|
||||
+ 0: b
|
||||
+
|
||||
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||
+ 11bb
|
||||
+ 0: b
|
||||
+
|
||||
# End of testinput4
|
||||
--
|
||||
2.7.4
|
||||
|
@ -43,6 +43,9 @@ Patch6: pcre2-10.22-Add-test-for-bug-already-fixed-by-the-refactoring.patch
|
||||
# Fix internal option documentation in pcre2pattern(3), in upstream after 10.22,
|
||||
# upstream bug #1875
|
||||
Patch7: pcre2-10.22-Fix-documentation-error.patch
|
||||
# Fix optimization bugs for patterns starting with lookaheads,
|
||||
# in upstream after 10.22, upstream bug #1882
|
||||
Patch8: pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch
|
||||
# New libtool to get rid of RPATH and to use distribution autotools
|
||||
BuildRequires: autoconf
|
||||
BuildRequires: automake
|
||||
@ -127,6 +130,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
|
||||
%patch5 -p1
|
||||
%patch6 -p1
|
||||
%patch7 -p1
|
||||
%patch8 -p1
|
||||
# Because of multilib patch
|
||||
libtoolize --copy --force
|
||||
autoreconf -vif
|
||||
@ -230,6 +234,8 @@ make %{?_smp_mflags} check VERBOSE=yes
|
||||
- Fix miscompilation of conditionals when a group name starts with "R"
|
||||
(upstream bug #1873)
|
||||
- Fix internal option documentation in pcre2pattern(3) (upstream bug #1875)
|
||||
- Fix optimization bugs for patterns starting with lookaheads
|
||||
(upstream bug #1882)
|
||||
|
||||
* Mon Aug 29 2016 Petr Pisar <ppisar@redhat.com> - 10.22-2
|
||||
- Fix matching characters above 255 when a negative character type was used
|
||||
|
Loading…
Reference in New Issue
Block a user