Fix optimization bugs for patterns starting with lookaheads
This commit is contained in:
parent
188762109b
commit
d25a61fd45
@ -0,0 +1,242 @@
|
|||||||
|
From d6d45d15235fc6f010cfb193db8fb672a152e41c Mon Sep 17 00:00:00 2001
|
||||||
|
From: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>
|
||||||
|
Date: Tue, 18 Oct 2016 15:10:09 +0000
|
||||||
|
Subject: [PATCH] Fix optimization bugs for patterns starting with lookaheads.
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1669 2f5784b3-3f2a-0410-8824-cb99058d5e15
|
||||||
|
|
||||||
|
Petr Písař: Ported to 8.39.
|
||||||
|
|
||||||
|
diff --git a/pcre_compile.c b/pcre_compile.c
|
||||||
|
index 67c74e8..de92313 100644
|
||||||
|
--- a/pcre_compile.c
|
||||||
|
+++ b/pcre_compile.c
|
||||||
|
@@ -7918,15 +7918,17 @@ for (;; ptr++)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
- /* For a forward assertion, we take the reqchar, if set. This can be
|
||||||
|
- helpful if the pattern that follows the assertion doesn't set a different
|
||||||
|
- char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
|
||||||
|
- for an assertion, however because it leads to incorrect effect for patterns
|
||||||
|
- such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
|
||||||
|
- of a firstchar. This is overcome by a scan at the end if there's no
|
||||||
|
- firstchar, looking for an asserted first char. */
|
||||||
|
-
|
||||||
|
- else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
|
||||||
|
+ /* For a forward assertion, we take the reqchar, if set, provided that the
|
||||||
|
+ group has also set a first char. This can be helpful if the pattern that
|
||||||
|
+ follows the assertion doesn't set a different char. For example, it's
|
||||||
|
+ useful for /(?=abcde).+/. We can't set firstchar for an assertion, however,
|
||||||
|
+ because it leads to incorrect effects for patterns such as /(?=a)a.+/ when
|
||||||
|
+ the "real" "a" would then become a reqchar instead of a firstchar. This is
|
||||||
|
+ overcome by a scan at the end if there's no firstchar, looking for an
|
||||||
|
+ asserted first char. */
|
||||||
|
+
|
||||||
|
+ else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
|
||||||
|
+ subfirstcharflags >= 0)
|
||||||
|
{
|
||||||
|
reqchar = subreqchar;
|
||||||
|
reqcharflags = subreqcharflags;
|
||||||
|
@@ -8715,8 +8717,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
|
||||||
|
the beginning or after \n). As in the case of is_anchored() (see above), we
|
||||||
|
have to take account of back references to capturing brackets that contain .*
|
||||||
|
because in that case we can't make the assumption. Also, the appearance of .*
|
||||||
|
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
|
||||||
|
-count, because once again the assumption no longer holds.
|
||||||
|
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
|
||||||
|
+or *SKIP does not count, because once again the assumption no longer holds.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
code points to start of expression (the bracket)
|
||||||
|
@@ -8725,13 +8727,14 @@ Arguments:
|
||||||
|
the less precise approach
|
||||||
|
cd points to the compile data
|
||||||
|
atomcount atomic group level
|
||||||
|
+ inassert TRUE if in an assertion
|
||||||
|
|
||||||
|
Returns: TRUE or FALSE
|
||||||
|
*/
|
||||||
|
|
||||||
|
static BOOL
|
||||||
|
is_startline(const pcre_uchar *code, unsigned int bracket_map,
|
||||||
|
- compile_data *cd, int atomcount)
|
||||||
|
+ compile_data *cd, int atomcount, BOOL inassert)
|
||||||
|
{
|
||||||
|
do {
|
||||||
|
const pcre_uchar *scode = first_significant_code(
|
||||||
|
@@ -8758,7 +8761,7 @@ do {
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
default: /* Assertion */
|
||||||
|
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
|
||||||
|
+ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
|
||||||
|
do scode += GET(scode, 1); while (*scode == OP_ALT);
|
||||||
|
scode += 1 + LINK_SIZE;
|
||||||
|
break;
|
||||||
|
@@ -8772,7 +8775,7 @@ do {
|
||||||
|
if (op == OP_BRA || op == OP_BRAPOS ||
|
||||||
|
op == OP_SBRA || op == OP_SBRAPOS)
|
||||||
|
{
|
||||||
|
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
|
||||||
|
+ if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Capturing brackets */
|
||||||
|
@@ -8782,33 +8785,33 @@ do {
|
||||||
|
{
|
||||||
|
int n = GET2(scode, 1+LINK_SIZE);
|
||||||
|
int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
|
||||||
|
- if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
|
||||||
|
+ if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Positive forward assertions */
|
||||||
|
|
||||||
|
else if (op == OP_ASSERT)
|
||||||
|
{
|
||||||
|
- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
|
||||||
|
+ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Atomic brackets */
|
||||||
|
|
||||||
|
else if (op == OP_ONCE || op == OP_ONCE_NC)
|
||||||
|
{
|
||||||
|
- if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
|
||||||
|
+ if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* .* means "start at start or after \n" if it isn't in atomic brackets or
|
||||||
|
- brackets that may be referenced, as long as the pattern does not contain
|
||||||
|
- *PRUNE or *SKIP, because these break the feature. Consider, for example,
|
||||||
|
- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
|
||||||
|
- start of a line. */
|
||||||
|
+ brackets that may be referenced or an assertion, as long as the pattern does
|
||||||
|
+ not contain *PRUNE or *SKIP, because these break the feature. Consider, for
|
||||||
|
+ example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
|
||||||
|
+ not at the start of a line. */
|
||||||
|
|
||||||
|
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
||||||
|
{
|
||||||
|
if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
|
||||||
|
- atomcount > 0 || cd->had_pruneorskip)
|
||||||
|
+ atomcount > 0 || cd->had_pruneorskip || inassert)
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -9663,7 +9666,7 @@ if ((re->options & PCRE_ANCHORED) == 0)
|
||||||
|
re->flags |= PCRE_FIRSTSET;
|
||||||
|
}
|
||||||
|
|
||||||
|
- else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
|
||||||
|
+ else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput1 b/testdata/testinput1
|
||||||
|
index 8379ce0..93abab3 100644
|
||||||
|
--- a/testdata/testinput1
|
||||||
|
+++ b/testdata/testinput1
|
||||||
|
@@ -5733,4 +5733,10 @@ AbcdCBefgBhiBqz
|
||||||
|
"(?|(\k'Pm')|(?'Pm'))"
|
||||||
|
abcd
|
||||||
|
|
||||||
|
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||||
|
+ \ Fred:099
|
||||||
|
+
|
||||||
|
+/(?=.*X)X$/
|
||||||
|
+ \ X
|
||||||
|
+
|
||||||
|
/-- End of testinput1 --/
|
||||||
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
||||||
|
index 38346ef..08c6f39 100644
|
||||||
|
--- a/testdata/testinput2
|
||||||
|
+++ b/testdata/testinput2
|
||||||
|
@@ -4247,4 +4247,6 @@ backtracking verbs. --/
|
||||||
|
|
||||||
|
/(?<R>abc)(?(R)xyz)/BZ
|
||||||
|
|
||||||
|
+/(?=.*[A-Z])/I
|
||||||
|
+
|
||||||
|
/-- End of testinput2 --/
|
||||||
|
diff --git a/testdata/testinput6 b/testdata/testinput6
|
||||||
|
index a178d3d..22ed1e6 100644
|
||||||
|
--- a/testdata/testinput6
|
||||||
|
+++ b/testdata/testinput6
|
||||||
|
@@ -1562,4 +1562,10 @@
|
||||||
|
\x{389}
|
||||||
|
\x{20ac}
|
||||||
|
|
||||||
|
+/(?=.*b)\pL/
|
||||||
|
+ 11bb
|
||||||
|
+
|
||||||
|
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||||
|
+ 11bb
|
||||||
|
+
|
||||||
|
/-- End of testinput6 --/
|
||||||
|
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
||||||
|
index e852ab9..a2b3cff 100644
|
||||||
|
--- a/testdata/testoutput1
|
||||||
|
+++ b/testdata/testoutput1
|
||||||
|
@@ -9434,4 +9434,12 @@ No match
|
||||||
|
0:
|
||||||
|
1:
|
||||||
|
|
||||||
|
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
|
||||||
|
+ \ Fred:099
|
||||||
|
+ 0:
|
||||||
|
+
|
||||||
|
+/(?=.*X)X$/
|
||||||
|
+ \ X
|
||||||
|
+ 0: X
|
||||||
|
+
|
||||||
|
/-- End of testinput1 --/
|
||||||
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
||||||
|
index 216bff8..811bbef 100644
|
||||||
|
--- a/testdata/testoutput2
|
||||||
|
+++ b/testdata/testoutput2
|
||||||
|
@@ -9380,7 +9380,7 @@ No need char
|
||||||
|
/(?(?=.*b).*b|^d)/I
|
||||||
|
Capturing subpattern count = 0
|
||||||
|
No options
|
||||||
|
-First char at start or follows newline
|
||||||
|
+No first char
|
||||||
|
No need char
|
||||||
|
|
||||||
|
/xyz/C
|
||||||
|
@@ -14698,4 +14698,11 @@ Failed: assertion expected after (?( or (?(?C) at offset 4
|
||||||
|
End
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
+/(?=.*[A-Z])/I
|
||||||
|
+Capturing subpattern count = 0
|
||||||
|
+May match empty string
|
||||||
|
+No options
|
||||||
|
+No first char
|
||||||
|
+No need char
|
||||||
|
+
|
||||||
|
/-- End of testinput2 --/
|
||||||
|
diff --git a/testdata/testoutput6 b/testdata/testoutput6
|
||||||
|
index b64dc0d..422d383 100644
|
||||||
|
--- a/testdata/testoutput6
|
||||||
|
+++ b/testdata/testoutput6
|
||||||
|
@@ -2573,4 +2573,12 @@ No match
|
||||||
|
\x{20ac}
|
||||||
|
No match
|
||||||
|
|
||||||
|
+/(?=.*b)\pL/
|
||||||
|
+ 11bb
|
||||||
|
+ 0: b
|
||||||
|
+
|
||||||
|
+/(?(?=.*b)(?=.*b)\pL|.*c)/
|
||||||
|
+ 11bb
|
||||||
|
+ 0: b
|
||||||
|
+
|
||||||
|
/-- End of testinput6 --/
|
||||||
|
--
|
||||||
|
2.7.4
|
||||||
|
|
10
pcre.spec
10
pcre.spec
@ -2,7 +2,7 @@
|
|||||||
#%%global rcversion RC1
|
#%%global rcversion RC1
|
||||||
Name: pcre
|
Name: pcre
|
||||||
Version: 8.39
|
Version: 8.39
|
||||||
Release: %{?rcversion:0.}4%{?rcversion:.%rcversion}%{?dist}.1
|
Release: %{?rcversion:0.}5%{?rcversion:.%rcversion}%{?dist}
|
||||||
%global myversion %{version}%{?rcversion:-%rcversion}
|
%global myversion %{version}%{?rcversion:-%rcversion}
|
||||||
Summary: Perl-compatible regular expression library
|
Summary: Perl-compatible regular expression library
|
||||||
Group: System Environment/Libraries
|
Group: System Environment/Libraries
|
||||||
@ -56,6 +56,9 @@ Patch7: pcre-8.39-Fix-bad-conditional-recursion-test-bug-when-a-group-.patch
|
|||||||
# Fix internal option documentation in pcrepattern(3), in upstream after 8.39,
|
# Fix internal option documentation in pcrepattern(3), in upstream after 8.39,
|
||||||
# upstream bug #1875
|
# upstream bug #1875
|
||||||
Patch8: pcre-8.39-Fix-documentation-error.patch
|
Patch8: pcre-8.39-Fix-documentation-error.patch
|
||||||
|
# Fix optimization bugs for patterns starting with lookaheads,
|
||||||
|
# in upstream after 8.39, upstream bug #1882
|
||||||
|
Patch9: pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch
|
||||||
BuildRequires: readline-devel
|
BuildRequires: readline-devel
|
||||||
BuildRequires: autoconf
|
BuildRequires: autoconf
|
||||||
BuildRequires: automake
|
BuildRequires: automake
|
||||||
@ -152,6 +155,7 @@ Utilities demonstrating PCRE capabilities like pcregrep or pcretest.
|
|||||||
%patch6 -p1
|
%patch6 -p1
|
||||||
%patch7 -p1
|
%patch7 -p1
|
||||||
%patch8 -p1
|
%patch8 -p1
|
||||||
|
%patch9 -p1
|
||||||
# Because of rpath patch
|
# Because of rpath patch
|
||||||
libtoolize --copy --force
|
libtoolize --copy --force
|
||||||
autoreconf -vif
|
autoreconf -vif
|
||||||
@ -248,8 +252,10 @@ make %{?_smp_mflags} check VERBOSE=yes
|
|||||||
%{_mandir}/man1/pcretest.*
|
%{_mandir}/man1/pcretest.*
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
* Wed Oct 19 2016 Petr Pisar <ppisar@redhat.com> - 8.39-4.1
|
* Wed Oct 19 2016 Petr Pisar <ppisar@redhat.com> - 8.39-5
|
||||||
- Fix internal option documentation in pcrepattern(3) (upstream bug #1875)
|
- Fix internal option documentation in pcrepattern(3) (upstream bug #1875)
|
||||||
|
- Fix optimization bugs for patterns starting with lookaheads
|
||||||
|
(upstream bug #1882)
|
||||||
|
|
||||||
* Fri Oct 14 2016 Petr Pisar <ppisar@redhat.com> - 8.39-4
|
* Fri Oct 14 2016 Petr Pisar <ppisar@redhat.com> - 8.39-4
|
||||||
- Fix displaying position in pcretest callout with an escape sequence greater
|
- Fix displaying position in pcretest callout with an escape sequence greater
|
||||||
|
Loading…
Reference in New Issue
Block a user