From d25a61fd454d7089b7132154c1697c1dd5c3773b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Wed, 19 Oct 2016 10:02:27 +0200 Subject: [PATCH] Fix optimization bugs for patterns starting with lookaheads --- ...-bugs-for-patterns-starting-with-loo.patch | 242 ++++++++++++++++++ pcre.spec | 10 +- 2 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch diff --git a/pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch b/pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch new file mode 100644 index 0000000..e038dfd --- /dev/null +++ b/pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch @@ -0,0 +1,242 @@ +From d6d45d15235fc6f010cfb193db8fb672a152e41c Mon Sep 17 00:00:00 2001 +From: ph10 +Date: Tue, 18 Oct 2016 15:10:09 +0000 +Subject: [PATCH] Fix optimization bugs for patterns starting with lookaheads. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1669 2f5784b3-3f2a-0410-8824-cb99058d5e15 + +Petr Písař: Ported to 8.39. + +diff --git a/pcre_compile.c b/pcre_compile.c +index 67c74e8..de92313 100644 +--- a/pcre_compile.c ++++ b/pcre_compile.c +@@ -7918,15 +7918,17 @@ for (;; ptr++) + } + } + +- /* For a forward assertion, we take the reqchar, if set. This can be +- helpful if the pattern that follows the assertion doesn't set a different +- char. For example, it's useful for /(?=abcde).+/. We can't set firstchar +- for an assertion, however because it leads to incorrect effect for patterns +- such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead +- of a firstchar. This is overcome by a scan at the end if there's no +- firstchar, looking for an asserted first char. */ +- +- else if (bravalue == OP_ASSERT && subreqcharflags >= 0) ++ /* For a forward assertion, we take the reqchar, if set, provided that the ++ group has also set a first char. This can be helpful if the pattern that ++ follows the assertion doesn't set a different char. For example, it's ++ useful for /(?=abcde).+/. We can't set firstchar for an assertion, however ++ because it leads to incorrect effect for patterns such as /(?=a)a.+/ when ++ the "real" "a" would then become a reqchar instead of a firstchar. This is ++ overcome by a scan at the end if there's no firstchar, looking for an ++ asserted first char. */ ++ ++ else if (bravalue == OP_ASSERT && subreqcharflags >= 0 && ++ subfirstcharflags >= 0) + { + reqchar = subreqchar; + reqcharflags = subreqcharflags; +@@ -8715,8 +8717,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at + the beginning or after \n). As in the case of is_anchored() (see above), we + have to take account of back references to capturing brackets that contain .* + because in that case we can't make the assumption. Also, the appearance of .* +-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not +-count, because once again the assumption no longer holds. ++inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE ++or *SKIP does not count, because once again the assumption no longer holds. + + Arguments: + code points to start of expression (the bracket) +@@ -8725,13 +8727,14 @@ Arguments: + the less precise approach + cd points to the compile data + atomcount atomic group level ++ inassert TRUE if in an assertion + + Returns: TRUE or FALSE + */ + + static BOOL + is_startline(const pcre_uchar *code, unsigned int bracket_map, +- compile_data *cd, int atomcount) ++ compile_data *cd, int atomcount, BOOL inassert) + { + do { + const pcre_uchar *scode = first_significant_code( +@@ -8758,7 +8761,7 @@ do { + return FALSE; + + default: /* Assertion */ +- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE; + do scode += GET(scode, 1); while (*scode == OP_ALT); + scode += 1 + LINK_SIZE; + break; +@@ -8772,7 +8775,7 @@ do { + if (op == OP_BRA || op == OP_BRAPOS || + op == OP_SBRA || op == OP_SBRAPOS) + { +- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE; + } + + /* Capturing brackets */ +@@ -8782,33 +8785,33 @@ do { + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)? (1 << n) : 1); +- if (!is_startline(scode, new_map, cd, atomcount)) return FALSE; ++ if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE; + } + + /* Positive forward assertions */ + + else if (op == OP_ASSERT) + { +- if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE; + } + + /* Atomic brackets */ + + else if (op == OP_ONCE || op == OP_ONCE_NC) + { +- if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE; ++ if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE; + } + + /* .* means "start at start or after \n" if it isn't in atomic brackets or +- brackets that may be referenced, as long as the pattern does not contain +- *PRUNE or *SKIP, because these break the feature. Consider, for example, +- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the +- start of a line. */ ++ brackets that may be referenced or an assertion, as long as the pattern does ++ not contain *PRUNE or *SKIP, because these break the feature. Consider, for ++ example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. ++ not at the start of a line. */ + + else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) + { + if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 || +- atomcount > 0 || cd->had_pruneorskip) ++ atomcount > 0 || cd->had_pruneorskip || inassert) + return FALSE; + } + +@@ -9663,7 +9666,7 @@ if ((re->options & PCRE_ANCHORED) == 0) + re->flags |= PCRE_FIRSTSET; + } + +- else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE; ++ else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE; + } + } + +diff --git a/testdata/testinput1 b/testdata/testinput1 +index 8379ce0..93abab3 100644 +--- a/testdata/testinput1 ++++ b/testdata/testinput1 +@@ -5733,4 +5733,10 @@ AbcdCBefgBhiBqz + "(?|(\k'Pm')|(?'Pm'))" + abcd + ++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ ++ \ Fred:099 ++ ++/(?=.*X)X$/ ++ \ X ++ + /-- End of testinput1 --/ +diff --git a/testdata/testinput2 b/testdata/testinput2 +index 38346ef..08c6f39 100644 +--- a/testdata/testinput2 ++++ b/testdata/testinput2 +@@ -4247,4 +4247,6 @@ backtracking verbs. --/ + + /(?abc)(?(R)xyz)/BZ + ++/(?=.*[A-Z])/I ++ + /-- End of testinput2 --/ +diff --git a/testdata/testinput6 b/testdata/testinput6 +index a178d3d..22ed1e6 100644 +--- a/testdata/testinput6 ++++ b/testdata/testinput6 +@@ -1562,4 +1562,10 @@ + \x{389} + \x{20ac} + ++/(?=.*b)\pL/ ++ 11bb ++ ++/(?(?=.*b)(?=.*b)\pL|.*c)/ ++ 11bb ++ + /-- End of testinput6 --/ +diff --git a/testdata/testoutput1 b/testdata/testoutput1 +index e852ab9..a2b3cff 100644 +--- a/testdata/testoutput1 ++++ b/testdata/testoutput1 +@@ -9434,4 +9434,12 @@ No match + 0: + 1: + ++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ ++ \ Fred:099 ++ 0: ++ ++/(?=.*X)X$/ ++ \ X ++ 0: X ++ + /-- End of testinput1 --/ +diff --git a/testdata/testoutput2 b/testdata/testoutput2 +index 216bff8..811bbef 100644 +--- a/testdata/testoutput2 ++++ b/testdata/testoutput2 +@@ -9380,7 +9380,7 @@ No need char + /(?(?=.*b).*b|^d)/I + Capturing subpattern count = 0 + No options +-First char at start or follows newline ++No first char + No need char + + /xyz/C +@@ -14698,4 +14698,11 @@ Failed: assertion expected after (?( or (?(?C) at offset 4 + End + ------------------------------------------------------------------ + ++/(?=.*[A-Z])/I ++Capturing subpattern count = 0 ++May match empty string ++No options ++No first char ++No need char ++ + /-- End of testinput2 --/ +diff --git a/testdata/testoutput6 b/testdata/testoutput6 +index b64dc0d..422d383 100644 +--- a/testdata/testoutput6 ++++ b/testdata/testoutput6 +@@ -2573,4 +2573,12 @@ No match + \x{20ac} + No match + ++/(?=.*b)\pL/ ++ 11bb ++ 0: b ++ ++/(?(?=.*b)(?=.*b)\pL|.*c)/ ++ 11bb ++ 0: b ++ + /-- End of testinput6 --/ +-- +2.7.4 + diff --git a/pcre.spec b/pcre.spec index 22a9778..d48ede3 100644 --- a/pcre.spec +++ b/pcre.spec @@ -2,7 +2,7 @@ #%%global rcversion RC1 Name: pcre Version: 8.39 -Release: %{?rcversion:0.}4%{?rcversion:.%rcversion}%{?dist}.1 +Release: %{?rcversion:0.}5%{?rcversion:.%rcversion}%{?dist} %global myversion %{version}%{?rcversion:-%rcversion} Summary: Perl-compatible regular expression library Group: System Environment/Libraries @@ -56,6 +56,9 @@ Patch7: pcre-8.39-Fix-bad-conditional-recursion-test-bug-when-a-group-.patch # Fix internal option documentation in pcrepattern(3), in upstream after 8.39, # upstream bug #1875 Patch8: pcre-8.39-Fix-documentation-error.patch +# Fix optimization bugs for patterns starting with lookaheads, +# in upstream after 8.39, upstream bug #1882 +Patch9: pcre-8.39-Fix-optimization-bugs-for-patterns-starting-with-loo.patch BuildRequires: readline-devel BuildRequires: autoconf BuildRequires: automake @@ -152,6 +155,7 @@ Utilities demonstrating PCRE capabilities like pcregrep or pcretest. %patch6 -p1 %patch7 -p1 %patch8 -p1 +%patch9 -p1 # Because of rpath patch libtoolize --copy --force autoreconf -vif @@ -248,8 +252,10 @@ make %{?_smp_mflags} check VERBOSE=yes %{_mandir}/man1/pcretest.* %changelog -* Wed Oct 19 2016 Petr Pisar - 8.39-4.1 +* Wed Oct 19 2016 Petr Pisar - 8.39-5 - Fix internal option documentation in pcrepattern(3) (upstream bug #1875) +- Fix optimization bugs for patterns starting with lookaheads + (upstream bug #1882) * Fri Oct 14 2016 Petr Pisar - 8.39-4 - Fix displaying position in pcretest callout with an escape sequence greater