From 14e4e937026b33da240cd1e8d136e663bc3d6e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Wed, 19 Oct 2016 13:38:16 +0200 Subject: [PATCH] Fix optimization bugs for patterns starting with lookaheads --- ...-bugs-when-pattern-starts-with-looka.patch | 280 ++++++++++++++++++ pcre2.spec | 6 + 2 files changed, 286 insertions(+) create mode 100644 pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch diff --git a/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch b/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch new file mode 100644 index 0000000..9704255 --- /dev/null +++ b/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch @@ -0,0 +1,280 @@ +From 95d88c4044fb74af8c3259db6413214fe9a106e2 Mon Sep 17 00:00:00 2001 +From: ph10 +Date: Tue, 18 Oct 2016 11:22:40 +0000 +Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069 + +Petr Písař: Ported to 10.22. + +Signed-off-by: Petr Písař +--- + src/pcre2_compile.c | 63 ++++++++++++++++++++++++++++++---------------------- + testdata/testinput1 | 6 +++++ + testdata/testinput2 | 2 ++ + testdata/testinput4 | 6 +++++ + testdata/testoutput1 | 8 +++++++ + testdata/testoutput2 | 6 ++++- + testdata/testoutput4 | 8 +++++++ + 7 files changed, 72 insertions(+), 27 deletions(-) + +diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c +index b3ebb25..fe37310 100644 +--- a/src/pcre2_compile.c ++++ b/src/pcre2_compile.c +@@ -5553,8 +5553,9 @@ for (;; ptr++) + *lengthptr += delta; + } + +- /* This is compiling for real. If there is a set first byte for +- the group, and we have not yet set a "required byte", set it. */ ++ /* This is compiling for real. If there is a set first code unit ++ for the group, and we have not yet set a "required code unit", set ++ it. */ + + else + { +@@ -7185,7 +7186,7 @@ for (;; ptr++) + zerofirstcuflags = firstcuflags; + groupsetfirstcu = FALSE; + +- if (bravalue >= OP_ONCE) ++ if (bravalue >= OP_ONCE) /* Not an assertion */ + { + /* If we have not yet set a firstcu in this branch, take it from the + subpattern, remembering that it was set here so that a repeat of more +@@ -7225,15 +7226,19 @@ for (;; ptr++) + } + } + +- /* For a forward assertion, we take the reqcu, if set. This can be +- helpful if the pattern that follows the assertion doesn't set a different +- char. For example, it's useful for /(?=abcde).+/. We can't set firstcu +- for an assertion, however because it leads to incorrect effect for patterns +- such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead +- of a firstcu. This is overcome by a scan at the end if there's no +- firstcu, looking for an asserted first char. */ +- +- else if (bravalue == OP_ASSERT && subreqcuflags >= 0) ++ /* For a forward assertion, we take the reqcu, if set, provided that the ++ group has also set a firstcu. This can be helpful if the pattern that ++ follows the assertion doesn't set a different char. For example, it's ++ useful for /(?=abcde).+/. We can't set firstcu for an assertion, however ++ because it leads to incorrect effect for patterns such as /(?=a)a.+/ when ++ the "real" "a" would then become a reqcu instead of a firstcu. This is ++ overcome by a scan at the end if there's no firstcu, looking for an ++ asserted first char. A similar effect for patterns like /(?=.*X)X$/ means ++ we must only take the reqcu when the group also set a firstcu. Otherwise, ++ in that example, 'X' ends up set for both. */ ++ ++ else if (bravalue == OP_ASSERT && subreqcuflags >= 0 && ++ subfirstcuflags >= 0) + { + reqcu = subreqcu; + reqcuflags = subreqcuflags; +@@ -8036,8 +8041,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at + the beginning or after \n). As in the case of is_anchored() (see above), we + have to take account of back references to capturing brackets that contain .* + because in that case we can't make the assumption. Also, the appearance of .* +-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not +-count, because once again the assumption no longer holds. ++inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE ++or *SKIP does not count, because once again the assumption no longer holds. + + Arguments: + code points to start of the compiled pattern or a group +@@ -8046,13 +8051,14 @@ Arguments: + the less precise approach + cb points to the compile data + atomcount atomic group level ++ inassert TRUE if in an assertion + + Returns: TRUE or FALSE + */ + + static BOOL + is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, +- int atomcount) ++ int atomcount, BOOL inassert) + { + do { + PCRE2_SPTR scode = first_significant_code( +@@ -8083,7 +8089,7 @@ do { + return FALSE; + + default: /* Assertion */ +- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + do scode += GET(scode, 1); while (*scode == OP_ALT); + scode += 1 + LINK_SIZE; + break; +@@ -8097,7 +8103,8 @@ do { + if (op == OP_BRA || op == OP_BRAPOS || + op == OP_SBRA || op == OP_SBRAPOS) + { +- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) ++ return FALSE; + } + + /* Capturing brackets */ +@@ -8107,33 +8114,36 @@ do { + { + int n = GET2(scode, 1+LINK_SIZE); + int new_map = bracket_map | ((n < 32)? (1u << n) : 1); +- if (!is_startline(scode, new_map, cb, atomcount)) return FALSE; ++ if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; + } + + /* Positive forward assertions */ + + else if (op == OP_ASSERT) + { +- if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE; ++ if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) ++ return FALSE; + } + + /* Atomic brackets */ + + else if (op == OP_ONCE || op == OP_ONCE_NC) + { +- if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE; ++ if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) ++ return FALSE; + } + + /* .* means "start at start or after \n" if it isn't in atomic brackets or +- brackets that may be referenced, as long as the pattern does not contain +- *PRUNE or *SKIP, because these break the feature. Consider, for example, +- /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the +- start of a line. There is also an option that disables this optimization. */ ++ brackets that may be referenced or an assertion, and as long as the pattern ++ does not contain *PRUNE or *SKIP, because these break the feature. Consider, ++ for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", ++ i.e. not at the start of a line. There is also an option that disables this ++ optimization. */ + + else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) + { + if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || +- atomcount > 0 || cb->had_pruneorskip || ++ atomcount > 0 || cb->had_pruneorskip || inassert || + (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) + return FALSE; + } +@@ -9023,7 +9033,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0) + when *PRUNE and SKIP are not present. (There is an option that disables this + case.) */ + +- else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE; ++ else if (is_startline(codestart, 0, &cb, 0, FALSE)) ++ re->flags |= PCRE2_STARTLINE; + } + + /* Handle the "required code unit", if one is set. In the case of an anchored +diff --git a/testdata/testinput1 b/testdata/testinput1 +index 6d7bc80..0d680d3 100644 +--- a/testdata/testinput1 ++++ b/testdata/testinput1 +@@ -5792,4 +5792,10 @@ name)/mark + aaaccccaaa + bccccb + ++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ ++ \ Fred:099 ++ ++/(?=.*X)X$/ ++ \ X ++ + # End of testinput1 +diff --git a/testdata/testinput2 b/testdata/testinput2 +index 2d4bbe5..94fbfab 100644 +--- a/testdata/testinput2 ++++ b/testdata/testinput2 +@@ -4823,4 +4823,6 @@ a)"xI + + /(?abc)(?(R)xyz)/B + ++/(?=.*[A-Z])/I ++ + # End of testinput2 +diff --git a/testdata/testinput4 b/testdata/testinput4 +index ce9145d..73582b7 100644 +--- a/testdata/testinput4 ++++ b/testdata/testinput4 +@@ -2282,4 +2282,10 @@ + \x{389} + \x{20ac} + ++/(?=.*b)\pL/ ++ 11bb ++ ++/(?(?=.*b)(?=.*b)\pL|.*c)/ ++ 11bb ++ + # End of testinput4 +diff --git a/testdata/testoutput1 b/testdata/testoutput1 +index d28bf91..02e07bf 100644 +--- a/testdata/testoutput1 ++++ b/testdata/testoutput1 +@@ -9257,4 +9257,12 @@ No match + 1: b + 2: cccc + ++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ ++ \ Fred:099 ++ 0: ++ ++/(?=.*X)X$/ ++ \ X ++ 0: X ++ + # End of testinput1 +diff --git a/testdata/testoutput2 b/testdata/testoutput2 +index ef7e329..bdb289d 100644 +--- a/testdata/testoutput2 ++++ b/testdata/testoutput2 +@@ -8751,7 +8751,6 @@ Subject length lower bound = 1 + + /(?(?=.*b).*b|^d)/I + Capturing subpattern count = 0 +-First code unit at start or follows newline + Subject length lower bound = 1 + + /xyz/auto_callout +@@ -15223,6 +15222,11 @@ No match + End + ------------------------------------------------------------------ + ++/(?=.*[A-Z])/I ++Capturing subpattern count = 0 ++May match empty string ++Subject length lower bound = 0 ++ + # End of testinput2 + Error -63: PCRE2_ERROR_BADDATA (unknown error number) + Error -62: bad serialized data +diff --git a/testdata/testoutput4 b/testdata/testoutput4 +index 701d411..d2d5e51 100644 +--- a/testdata/testoutput4 ++++ b/testdata/testoutput4 +@@ -3703,4 +3703,12 @@ No match + \x{20ac} + No match + ++/(?=.*b)\pL/ ++ 11bb ++ 0: b ++ ++/(?(?=.*b)(?=.*b)\pL|.*c)/ ++ 11bb ++ 0: b ++ + # End of testinput4 +-- +2.7.4 + diff --git a/pcre2.spec b/pcre2.spec index 6ed8f7a..0d62523 100644 --- a/pcre2.spec +++ b/pcre2.spec @@ -43,6 +43,9 @@ Patch6: pcre2-10.22-Add-test-for-bug-already-fixed-by-the-refactoring.patch # Fix internal option documentation in pcre2pattern(3), in upstream after 10.22, # upstream bug #1875 Patch7: pcre2-10.22-Fix-documentation-error.patch +# Fix optimization bugs for patterns starting with lookaheads, +# in upstream after 10.22, upstream bug #1882 +Patch8: pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch # New libtool to get rid of RPATH and to use distribution autotools BuildRequires: autoconf BuildRequires: automake @@ -127,6 +130,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test. %patch5 -p1 %patch6 -p1 %patch7 -p1 +%patch8 -p1 # Because of multilib patch libtoolize --copy --force autoreconf -vif @@ -230,6 +234,8 @@ make %{?_smp_mflags} check VERBOSE=yes - Fix miscopmilation of conditionals when a group name start with "R" (upstream bug #1873) - Fix internal option documentation in pcre2pattern(3) (upstream bug #1875) +- Fix optimization bugs for patterns starting with lookaheads + (upstream bug #1882) * Mon Aug 29 2016 Petr Pisar - 10.22-2 - Fix matching characters above 255 when a negative character type was used