From 14e4e937026b33da240cd1e8d136e663bc3d6e10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= <ppisar@redhat.com>
Date: Wed, 19 Oct 2016 13:38:16 +0200
Subject: [PATCH] Fix optimization bugs for patterns starting with lookaheads

---
 ...-bugs-when-pattern-starts-with-looka.patch | 280 ++++++++++++++++++
 pcre2.spec                                    |   6 +
 2 files changed, 286 insertions(+)
 create mode 100644 pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch

diff --git a/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch b/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch
new file mode 100644
index 0000000..9704255
--- /dev/null
+++ b/pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch
@@ -0,0 +1,280 @@
+From 95d88c4044fb74af8c3259db6413214fe9a106e2 Mon Sep 17 00:00:00 2001
+From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
+Date: Tue, 18 Oct 2016 11:22:40 +0000
+Subject: [PATCH] Fix optimization bugs when pattern starts with lookahead.
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@573 6239d852-aaf2-0410-a92c-79f79f948069
+
+Petr Písař: Ported to 10.22.
+
+Signed-off-by: Petr Písař <ppisar@redhat.com>
+---
+ src/pcre2_compile.c  | 63 ++++++++++++++++++++++++++++++----------------------
+ testdata/testinput1  |  6 +++++
+ testdata/testinput2  |  2 ++
+ testdata/testinput4  |  6 +++++
+ testdata/testoutput1 |  8 +++++++
+ testdata/testoutput2 |  6 ++++-
+ testdata/testoutput4 |  8 +++++++
+ 7 files changed, 72 insertions(+), 27 deletions(-)
+
+diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
+index b3ebb25..fe37310 100644
+--- a/src/pcre2_compile.c
++++ b/src/pcre2_compile.c
+@@ -5553,8 +5553,9 @@ for (;; ptr++)
+             *lengthptr += delta;
+             }
+ 
+-          /* This is compiling for real. If there is a set first byte for
+-          the group, and we have not yet set a "required byte", set it. */
++            /* This is compiling for real. If there is a set first code unit
++            for the group, and we have not yet set a "required code unit", set
++            it. */
+ 
+           else
+             {
+@@ -7185,7 +7186,7 @@ for (;; ptr++)
+     zerofirstcuflags = firstcuflags;
+     groupsetfirstcu = FALSE;
+ 
+-    if (bravalue >= OP_ONCE)
++    if (bravalue >= OP_ONCE)  /* Not an assertion */
+       {
+       /* If we have not yet set a firstcu in this branch, take it from the
+       subpattern, remembering that it was set here so that a repeat of more
+@@ -7225,15 +7226,19 @@ for (;; ptr++)
+         }
+       }
+ 
+-    /* For a forward assertion, we take the reqcu, if set. This can be
+-    helpful if the pattern that follows the assertion doesn't set a different
+-    char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
+-    for an assertion, however because it leads to incorrect effect for patterns
+-    such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
+-    of a firstcu. This is overcome by a scan at the end if there's no
+-    firstcu, looking for an asserted first char. */
+-
+-    else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
++    /* For a forward assertion, we take the reqcu, if set, provided that the
++    group has also set a firstcu. This can be helpful if the pattern that
++    follows the assertion doesn't set a different char. For example, it's
++    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however,
++    because it leads to incorrect effects for patterns such as /(?=a)a.+/ when
++    the "real" "a" would then become a reqcu instead of a firstcu. This is
++    overcome by a scan at the end if there's no firstcu, looking for an
++    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
++    we must only take the reqcu when the group also set a firstcu. Otherwise,
++    in that example, 'X' ends up set for both. */
++
++    else if (bravalue == OP_ASSERT && subreqcuflags >= 0 &&
++             subfirstcuflags >= 0)
+       {
+       reqcu = subreqcu;
+       reqcuflags = subreqcuflags;
+@@ -8036,8 +8041,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
+ the beginning or after \n). As in the case of is_anchored() (see above), we
+ have to take account of back references to capturing brackets that contain .*
+ because in that case we can't make the assumption. Also, the appearance of .*
+-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
+-count, because once again the assumption no longer holds.
++inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
++or *SKIP does not count, because once again the assumption no longer holds.
+ 
+ Arguments:
+   code           points to start of the compiled pattern or a group
+@@ -8046,13 +8051,14 @@ Arguments:
+                    the less precise approach
+   cb             points to the compile data
+   atomcount      atomic group level
++  inassert       TRUE if in an assertion
+ 
+ Returns:         TRUE or FALSE
+ */
+ 
+ static BOOL
+ is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
+-  int atomcount)
++  int atomcount, BOOL inassert)
+ {
+ do {
+    PCRE2_SPTR scode = first_significant_code(
+@@ -8083,7 +8089,7 @@ do {
+        return FALSE;
+ 
+        default:     /* Assertion */
+-       if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
++       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
+        do scode += GET(scode, 1); while (*scode == OP_ALT);
+        scode += 1 + LINK_SIZE;
+        break;
+@@ -8097,7 +8103,8 @@ do {
+    if (op == OP_BRA  || op == OP_BRAPOS ||
+        op == OP_SBRA || op == OP_SBRAPOS)
+      {
+-     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
++     if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
++       return FALSE;
+      }
+ 
+    /* Capturing brackets */
+@@ -8107,33 +8114,36 @@ do {
+      {
+      int n = GET2(scode, 1+LINK_SIZE);
+      int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
+-     if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
++     if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
+      }
+ 
+    /* Positive forward assertions */
+ 
+    else if (op == OP_ASSERT)
+      {
+-     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
++     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
++       return FALSE;
+      }
+ 
+    /* Atomic brackets */
+ 
+    else if (op == OP_ONCE || op == OP_ONCE_NC)
+      {
+-     if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
++     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
++       return FALSE;
+      }
+ 
+    /* .* means "start at start or after \n" if it isn't in atomic brackets or
+-   brackets that may be referenced, as long as the pattern does not contain
+-   *PRUNE or *SKIP, because these break the feature. Consider, for example,
+-   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
+-   start of a line. There is also an option that disables this optimization. */
++   brackets that may be referenced or an assertion, and as long as the pattern
++   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
++   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
++   i.e. not at the start of a line. There is also an option that disables this
++   optimization. */
+ 
+    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
+      {
+      if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
+-         atomcount > 0 || cb->had_pruneorskip ||
++         atomcount > 0 || cb->had_pruneorskip || inassert ||
+          (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
+        return FALSE;
+      }
+@@ -9023,7 +9033,8 @@ if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
+   when *PRUNE and SKIP are not present. (There is an option that disables this
+   case.) */
+ 
+-  else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
++  else if (is_startline(codestart, 0, &cb, 0, FALSE))
++    re->flags |= PCRE2_STARTLINE;
+   }
+ 
+ /* Handle the "required code unit", if one is set. In the case of an anchored
+diff --git a/testdata/testinput1 b/testdata/testinput1
+index 6d7bc80..0d680d3 100644
+--- a/testdata/testinput1
++++ b/testdata/testinput1
+@@ -5792,4 +5792,10 @@ name)/mark
+     aaaccccaaa
+     bccccb 
+ 
++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
++    \   Fred:099
++
++/(?=.*X)X$/ 
++    \  X
++
+ # End of testinput1 
+diff --git a/testdata/testinput2 b/testdata/testinput2
+index 2d4bbe5..94fbfab 100644
+--- a/testdata/testinput2
++++ b/testdata/testinput2
+@@ -4823,4 +4823,6 @@ a)"xI
+ 
+ /(?<R>abc)(?(R)xyz)/B
+ 
++/(?=.*[A-Z])/I
++
+ # End of testinput2 
+diff --git a/testdata/testinput4 b/testdata/testinput4
+index ce9145d..73582b7 100644
+--- a/testdata/testinput4
++++ b/testdata/testinput4
+@@ -2282,4 +2282,10 @@
+     \x{389}
+     \x{20ac}
+ 
++/(?=.*b)\pL/
++    11bb
++    
++/(?(?=.*b)(?=.*b)\pL|.*c)/
++    11bb
++
+ # End of testinput4
+diff --git a/testdata/testoutput1 b/testdata/testoutput1
+index d28bf91..02e07bf 100644
+--- a/testdata/testoutput1
++++ b/testdata/testoutput1
+@@ -9257,4 +9257,12 @@ No match
+  1: b
+  2: cccc
+ 
++/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
++    \   Fred:099
++ 0: 
++
++/(?=.*X)X$/ 
++    \  X
++ 0: X
++
+ # End of testinput1 
+diff --git a/testdata/testoutput2 b/testdata/testoutput2
+index ef7e329..bdb289d 100644
+--- a/testdata/testoutput2
++++ b/testdata/testoutput2
+@@ -8751,7 +8751,6 @@ Subject length lower bound = 1
+ 
+ /(?(?=.*b).*b|^d)/I
+ Capturing subpattern count = 0
+-First code unit at start or follows newline
+ Subject length lower bound = 1
+ 
+ /xyz/auto_callout
+@@ -15223,6 +15222,11 @@ No match
+         End
+ ------------------------------------------------------------------
+ 
++/(?=.*[A-Z])/I
++Capturing subpattern count = 0
++May match empty string
++Subject length lower bound = 0
++
+ # End of testinput2 
+ Error -63: PCRE2_ERROR_BADDATA (unknown error number)
+ Error -62: bad serialized data
+diff --git a/testdata/testoutput4 b/testdata/testoutput4
+index 701d411..d2d5e51 100644
+--- a/testdata/testoutput4
++++ b/testdata/testoutput4
+@@ -3703,4 +3703,12 @@ No match
+     \x{20ac}
+ No match
+ 
++/(?=.*b)\pL/
++    11bb
++ 0: b
++    
++/(?(?=.*b)(?=.*b)\pL|.*c)/
++    11bb
++ 0: b
++
+ # End of testinput4
+-- 
+2.7.4
+
diff --git a/pcre2.spec b/pcre2.spec
index 6ed8f7a..0d62523 100644
--- a/pcre2.spec
+++ b/pcre2.spec
@@ -43,6 +43,9 @@ Patch6:     pcre2-10.22-Add-test-for-bug-already-fixed-by-the-refactoring.patch
 # Fix internal option documentation in pcre2pattern(3), in upstream after 10.22,
 # upstream bug #1875
 Patch7:     pcre2-10.22-Fix-documentation-error.patch
+# Fix optimization bugs for patterns starting with lookaheads,
+# in upstream after 10.22, upstream bug #1882
+Patch8:     pcre2-10.22-Fix-optimization-bugs-when-pattern-starts-with-looka.patch
 # New libtool to get rid of RPATH and to use distribution autotools
 BuildRequires:  autoconf
 BuildRequires:  automake
@@ -127,6 +130,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
 %patch5 -p1
 %patch6 -p1
 %patch7 -p1
+%patch8 -p1
 # Because of multilib patch
 libtoolize --copy --force
 autoreconf -vif
@@ -230,6 +234,8 @@ make %{?_smp_mflags} check VERBOSE=yes
 - Fix miscopmilation of conditionals when a group name start with "R"
   (upstream bug #1873)
 - Fix internal option documentation in pcre2pattern(3) (upstream bug #1875)
+- Fix optimization bugs for patterns starting with lookaheads
+  (upstream bug #1882)
 
 * Mon Aug 29 2016 Petr Pisar <ppisar@redhat.com> - 10.22-2
 - Fix matching characters above 255 when a negative character type was used