281 lines
9.5 KiB
Diff
281 lines
9.5 KiB
Diff
|
From 95d88c4044fb74af8c3259db6413214fe9a106e2 Mon Sep 17 00:00:00 2001
|
||
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||
|
Date: Tue, 18 Oct 2016 11:22:40 +0000
|
||
|
Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead.
|
||
|
MIME-Version: 1.0
|
||
|
Content-Type: text/plain; charset=UTF-8
|
||
|
Content-Transfer-Encoding: 8bit
|
||
|
|
||
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069
|
||
|
|
||
|
Petr Písař: Ported to 10.22.
|
||
|
|
||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||
|
---
|
||
|
src/pcre2_compile.c | 63 ++++++++++++++++++++++++++++++----------------------
|
||
|
testdata/testinput1 | 6 +++++
|
||
|
testdata/testinput2 | 2 ++
|
||
|
testdata/testinput4 | 6 +++++
|
||
|
testdata/testoutput1 | 8 +++++++
|
||
|
testdata/testoutput2 | 6 ++++-
|
||
|
testdata/testoutput4 | 8 +++++++
|
||
|
7 files changed, 72 insertions(+), 27 deletions(-)
|
||
|
|
||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||
|
index b3ebb25..fe37310 100644
|
||
|
--- a/src/pcre2_compile.c
|
||
|
+++ b/src/pcre2_compile.c
|
||
|
@@ -5553,8 +5553,9 @@ for (;; ptr++)
|
||
|
*lengthptr += delta;
|
||
|
}
|
||
|
|
||
|
- /* This is compiling for real. If there is a set first byte for
|
||
|
- the group, and we have not yet set a "required byte", set it. */
|
||
|
+ /* This is compiling for real. If there is a set first code unit
|
||
|
+ for the group, and we have not yet set a "required code unit", set
|
||
|
+ it. */
|
||
|
|
||
|
else
|
||
|
{
|
||
|
@@ -7185,7 +7186,7 @@ for (;; ptr++)
|
||
|
zerofirstcuflags = firstcuflags;
|
||
|
groupsetfirstcu = FALSE;
|
||
|
|
||
|
- if (bravalue >= OP_ONCE)
|
||
|
+ if (bravalue >= OP_ONCE) /* Not an assertion */
|
||
|
{
|
||
|
/* If we have not yet set a firstcu in this branch, take it from the
|
||
|
subpattern, remembering that it was set here so that a repeat of more
|
||
|
@@ -7225,15 +7226,19 @@ for (;; ptr++)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
- /* For a forward assertion, we take the reqcu, if set. This can be
|
||
|
- helpful if the pattern that follows the assertion doesn't set a different
|
||
|
- char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
|
||
|
- for an assertion, however because it leads to incorrect effect for patterns
|
||
|
- such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
|
||
|
- of a firstcu. This is overcome by a scan at the end if there's no
|
||
|
- firstcu, looking for an asserted first char. */
|
||
|
-
|
||
|
- else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
|
||
|
+ /* For a forward assertion, we take the reqcu, if set, provided that the
|
||
|
+ group has also set a firstcu. This can be helpful if the pattern that
|
||
|
+ follows the assertion doesn't set a different char. For example, it's
|
||
|
+ useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
|
||
|
+ because it leads to an incorrect effect for patterns such as /(?=a)a.+/ when
|
||
|
+ the "real" "a" would then become a reqcu instead of a firstcu. This is
|
||
|
+ overcome by a scan at the end if there's no firstcu, looking for an
|
||
|
+ asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
|
||
|
+ we must only take the reqcu when the group also set a firstcu. Otherwise,
|
||
|
+ in that example, 'X' ends up set for both. */
|
||
|
+
|
||
|
+ else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
|
||
|
+ subfirstcuflags >= 0)
|
||
|
{
|
||
|
reqcu = subreqcu;
|
||
|
reqcuflags = subreqcuflags;
|
||
|
@@ -8036,8 +8041,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
|
||
|
the beginning or after \n). As in the case of is_anchored() (see above), we
|
||
|
have to take account of back references to capturing brackets that contain .*
|
||
|
because in that case we can't make the assumption. Also, the appearance of .*
|
||
|
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
|
||
|
-count, because once again the assumption no longer holds.
|
||
|
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
|
||
|
+or *SKIP does not count, because once again the assumption no longer holds.
|
||
|
|
||
|
Arguments:
|
||
|
code points to start of the compiled pattern or a group
|
||
|
@@ -8046,13 +8051,14 @@ Arguments:
|
||
|
the less precise approach
|
||
|
cb points to the compile data
|
||
|
atomcount atomic group level
|
||
|
+ inassert TRUE if in an assertion
|
||
|
|
||
|
Returns: TRUE or FALSE
|
||
|
*/
|
||
|
|
||
|
static BOOL
|
||
|
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
|
||
|
- int atomcount)
|
||
|
+ int atomcount, BOOL inassert)
|
||
|
{
|
||
|
do {
|
||
|
PCRE2_SPTR scode = first_significant_code(
|
||
|
@@ -8083,7 +8089,7 @@ do {
|
||
|
return FALSE;
|
||
|
|
||
|
default: /* Assertion */
|
||
|
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||
|
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
|
||
|
do scode += GET(scode, 1); while (*scode == OP_ALT);
|
||
|
scode += 1 + LINK_SIZE;
|
||
|
break;
|
||
|
@@ -8097,7 +8103,8 @@ do {
|
||
|
if (op == OP_BRA || op == OP_BRAPOS ||
|
||
|
op == OP_SBRA || op == OP_SBRAPOS)
|
||
|
{
|
||
|
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||
|
+ if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
|
||
|
+ return FALSE;
|
||
|
}
|
||
|
|
||
|
/* Capturing brackets */
|
||
|
@@ -8107,33 +8114,36 @@ do {
|
||
|
{
|
||
|
int n = GET2(scode, 1+LINK_SIZE);
|
||
|
int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
|
||
|
- if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
|
||
|
+ if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
|
||
|
}
|
||
|
|
||
|
/* Positive forward assertions */
|
||
|
|
||
|
else if (op == OP_ASSERT)
|
||
|
{
|
||
|
- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
|
||
|
+ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
|
||
|
+ return FALSE;
|
||
|
}
|
||
|
|
||
|
/* Atomic brackets */
|
||
|
|
||
|
else if (op == OP_ONCE || op == OP_ONCE_NC)
|
||
|
{
|
||
|
- if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
|
||
|
+ if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
|
||
|
+ return FALSE;
|
||
|
}
|
||
|
|
||
|
/* .* means "start at start or after \n" if it isn't in atomic brackets or
|
||
|
- brackets that may be referenced, as long as the pattern does not contain
|
||
|
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
|
||
|
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
|
||
|
- start of a line. There is also an option that disables this optimization. */
|
||
|
+ brackets that may be referenced or an assertion, and as long as the pattern
|
||
|
+ does not contain *PRUNE or *SKIP, because these break the feature. Consider,
|
||
|
+ for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
|
||
|
+ i.e. not at the start of a line. There is also an option that disables this
|
||
|
+ optimization. */
|
||
|
|
||
|
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
||
|
{
|
||
|
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
|
||
|
- atomcount > 0 || cb->had_pruneorskip ||
|
||
|
+ atomcount > 0 || cb->had_pruneorskip || inassert ||
|
||
|
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
|
||
|
return FALSE;
|
||
|
}
|
||
|
@@ -9023,7 +9033,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
|
||
|
when *PRUNE and SKIP are not present. (There is an option that disables this
|
||
|
case.) */
|
||
|
|
||
|
- else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
|
||
|
+ else if (is_startline(codestart, 0, &cb, 0, FALSE))
|
||
|
+ re->flags |= PCRE2_STARTLINE;
|
||
|
}
|
||
|
|
||
|
/* Handle the "required code unit", if one is set. In the case of an anchored
|
||
|
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||
|
index 6d7bc80..0d680d3 100644
|
||
|
--- a/testdata/testinput1
|
||
|
+++ b/testdata/testinput1
|
||
|
@@ -5792,4 +5792,10 @@ name)/mark
|
||
|
aaaccccaaa
|
||
|
bccccb
|
||
|
|
||
|
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||
|
+ \ Fred:099
|
||
|
+
|
||
|
+/(?=.*X)X$/
|
||
|
+ \ X
|
||
|
+
|
||
|
# End of testinput1
|
||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||
|
index 2d4bbe5..94fbfab 100644
|
||
|
--- a/testdata/testinput2
|
||
|
+++ b/testdata/testinput2
|
||
|
@@ -4823,4 +4823,6 @@ a)"xI
|
||
|
|
||
|
/(?<R>abc)(?(R)xyz)/B
|
||
|
|
||
|
+/(?=.*[A-Z])/I
|
||
|
+
|
||
|
# End of testinput2
|
||
|
diff --git a/testdata/testinput4 b/testdata/testinput4
|
||
|
index ce9145d..73582b7 100644
|
||
|
--- a/testdata/testinput4
|
||
|
+++ b/testdata/testinput4
|
||
|
@@ -2282,4 +2282,10 @@
|
||
|
\x{389}
|
||
|
\x{20ac}
|
||
|
|
||
|
+/(?=.*b)\pL/
|
||
|
+ 11bb
|
||
|
+
|
||
|
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||
|
+ 11bb
|
||
|
+
|
||
|
# End of testinput4
|
||
|
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||
|
index d28bf91..02e07bf 100644
|
||
|
--- a/testdata/testoutput1
|
||
|
+++ b/testdata/testoutput1
|
||
|
@@ -9257,4 +9257,12 @@ No match
|
||
|
1: b
|
||
|
2: cccc
|
||
|
|
||
|
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||
|
+ \ Fred:099
|
||
|
+ 0:
|
||
|
+
|
||
|
+/(?=.*X)X$/
|
||
|
+ \ X
|
||
|
+ 0: X
|
||
|
+
|
||
|
# End of testinput1
|
||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||
|
index ef7e329..bdb289d 100644
|
||
|
--- a/testdata/testoutput2
|
||
|
+++ b/testdata/testoutput2
|
||
|
@@ -8751,7 +8751,6 @@ Subject length lower bound = 1
|
||
|
|
||
|
/(?(?=.*b).*b|^d)/I
|
||
|
Capturing subpattern count = 0
|
||
|
-First code unit at start or follows newline
|
||
|
Subject length lower bound = 1
|
||
|
|
||
|
/xyz/auto_callout
|
||
|
@@ -15223,6 +15222,11 @@ No match
|
||
|
End
|
||
|
------------------------------------------------------------------
|
||
|
|
||
|
+/(?=.*[A-Z])/I
|
||
|
+Capturing subpattern count = 0
|
||
|
+May match empty string
|
||
|
+Subject length lower bound = 0
|
||
|
+
|
||
|
# End of testinput2
|
||
|
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
||
|
Error -62: bad serialized data
|
||
|
diff --git a/testdata/testoutput4 b/testdata/testoutput4
|
||
|
index 701d411..d2d5e51 100644
|
||
|
--- a/testdata/testoutput4
|
||
|
+++ b/testdata/testoutput4
|
||
|
@@ -3703,4 +3703,12 @@ No match
|
||
|
\x{20ac}
|
||
|
No match
|
||
|
|
||
|
+/(?=.*b)\pL/
|
||
|
+ 11bb
|
||
|
+ 0: b
|
||
|
+
|
||
|
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||
|
+ 11bb
|
||
|
+ 0: b
|
||
|
+
|
||
|
# End of testinput4
|
||
|
--
|
||
|
2.7.4
|
||
|
|