pcre2/pcre2-10.31-RC1-Fix-auto-possessification-bug-at-the-end-of-a-captur.patch

From b70b8394e79360f92c87a169c1ba982a7e22d9d6 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Wed, 31 Jan 2018 17:53:56 +0000
Subject: [PATCH] Fix auto-possessification bug at the end of a capturing group
 that is called recursively.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@912 6239d852-aaf2-0410-a92c-79f79f948069

Petr Písař: Ported to 10.31-RC1.

 src/pcre2_auto_possess.c |  56 ++++++++++++++++++------
 testdata/testinput1      |  30 +++++++++++++
 testdata/testinput2      |  17 ++++++++
 testdata/testoutput1     |  64 ++++++++++++++++++++++++++++
 testdata/testoutput2     | 109 ++++++++++++++++++++++++++++++++++++++++++++++-

diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c
index ad3543f..23275a2 100644
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@@ -558,47 +558,73 @@ for(;;)
     continue;
     }

+  /* At the end of a branch, skip to the end of the group. */
+
   if (c == OP_ALT)
     {
     do code += GET(code, 1); while (*code == OP_ALT);
     c = *code;
     }

+  /* Inspect the next opcode. */
+
   switch(c)
     {
-    case OP_END:
-    case OP_KETRPOS:
-    /* TRUE only in greedy case. The non-greedy case could be replaced by
-    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
-    uses more memory, which we cannot get at this stage.) */
+    /* We can always possessify a greedy iterator at the end of the pattern,
+    which is reached after skipping over the final OP_KET. A non-greedy
+    iterator must never be possessified. */

+    case OP_END:
     return base_list[1] != 0;

+    /* When an iterator is at the end of certain kinds of group we can inspect
+    what follows the group by skipping over the closing ket. Note that this
+    does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
+    iteration is variable (could be another iteration or could be the next
+    item). As these two opcodes are not listed in the next switch, they will
+    end up as the next code to inspect, and return FALSE by virtue of being
+    unsupported. */
+
     case OP_KET:
-    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
-    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
-    cannot be converted to a possessive form. */
+    case OP_KETRPOS:
+    /* The non-greedy case cannot be converted to a possessive form. */

     if (base_list[1] == 0) return FALSE;

+    /* If the bracket is capturing, it might be referenced by an OP_RECURSE,
+    so its last iterator can never be possessified if the pattern contains
+    recursions. (This could be improved by keeping a list of group numbers that
+    are called by recursion.) */
+
     switch(*(code - GET(code, 1)))
       {
+      case OP_CBRA:
+      case OP_SCBRA:
+      case OP_CBRAPOS:
+      case OP_SCBRAPOS:
+      if (cb->had_recurse) return FALSE;
+      break;
+
+      /* Atomic sub-patterns and assertions can always auto-possessify their
+      last iterator. However, if the group was entered as a result of checking
+      a previous iterator, this is not possible. */
+
       case OP_ASSERT:
       case OP_ASSERT_NOT:
       case OP_ASSERTBACK:
       case OP_ASSERTBACK_NOT:
       case OP_ONCE:

-      /* Atomic sub-patterns and assertions can always auto-possessify their
-      last iterator. However, if the group was entered as a result of checking
-      a previous iterator, this is not possible. */
-
       return !entered_a_group;
       }

+    /* Skip over the bracket and inspect what comes next. */
+
     code += PRIV(OP_lengths)[c];
     continue;

+    /* Handle cases where the next item is a group. */
+
     case OP_ONCE:
     case OP_BRA:
     case OP_CBRA:
@@ -637,11 +663,15 @@ for(;;)
     code += PRIV(OP_lengths)[c];
     continue;

+    /* The next opcode does not need special handling; fall through and use it
+    to see if the base can be possessified. */
+
     default:
     break;
     }

-  /* Check for a supported opcode, and load its properties. */
+  /* We now have the next appropriate opcode to compare with the base. Check
+  for a supported opcode, and load its properties. */

   code = get_chr_property_list(code, utf, cb->fcc, list);
   if (code == NULL) return FALSE;    /* Unsupported */
diff --git a/testdata/testinput1 b/testdata/testinput1
index 756b4af..9a9c5fd 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -6159,4 +6159,34 @@ ef) x/x,mark
 /((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
     XYYZ

+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+    a
+
+/^(a?)b(?1)a/
+    abaa
+    aba
+    baa
+    ba
+
+/^(a?)+b(?1)a/
+    abaa
+    aba
+    baa
+    ba
+
+/^(a?)++b(?1)a/
+    abaa
+    aba
+    baa
+    ba
+
+/^(a?)+b/
+    b
+    ab
+    aaab
+
+/(?=a+)a(a+)++b/
+    aab
+
 # End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index 2c2334c..5d3a80e 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5412,4 +5412,21 @@ a)"xI
 \= Expect no match
     \na

+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified.
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+
+/^(a?)b(?1)a/B
+
+/^(a?)+b(?1)a/B
+
+/^(a?)++b(?1)a/B
+
+/^(a?)+b/B
+
+/(?=a+)a(a+)++b/B
+
 # End of testinput2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 7cf647e..9c55be9 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9758,4 +9758,68 @@ No match
  1: Y
  2:

+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
+    aa
+ 0: aa
+    a
+ 0: a
+
+/^(a?)b(?1)a/
+    abaa
+ 0: abaa
+ 1: a
+    aba
+ 0: aba
+ 1: a
+    baa
+ 0: baa
+ 1:
+    ba
+ 0: ba
+ 1:
+
+/^(a?)+b(?1)a/
+    abaa
+ 0: abaa
+ 1:
+    aba
+ 0: aba
+ 1:
+    baa
+ 0: baa
+ 1:
+    ba
+ 0: ba
+ 1:
+
+/^(a?)++b(?1)a/
+    abaa
+ 0: abaa
+ 1:
+    aba
+ 0: aba
+ 1:
+    baa
+ 0: baa
+ 1:
+    ba
+ 0: ba
+ 1:
+
+/^(a?)+b/
+    b
+ 0: b
+ 1:
+    ab
+ 0: ab
+ 1:
+    aaab
+ 0: aaab
+ 1:
+
+/(?=a+)a(a+)++b/
+    aab
+ 0: aab
+ 1: a
+
 # End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index f783320..fcaac8f 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -12701,7 +12701,7 @@ Subject length lower bound = 5
         Ket
         a
         CBraPos 1
-        a++
+        a+
         KetRpos
         a
         Ket
@@ -16468,6 +16468,113 @@ No match
     \na
 No match

+# These tests are matched in test 1 as they are Perl compatible. Here we are
+# looking at what does and does not get auto-possessified.
+
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
+------------------------------------------------------------------
+        Bra
+        Cond
+        Cond false
+        CBra 1
+        a?
+        Ket
+        X
+        Ket
+        ^
+        Recurse
+        a
+        $
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        CBra 1
+        a?
+        Ket
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)++b(?1)a/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBraPos 1
+        a?
+        KetRpos
+        b
+        Recurse
+        a
+        Ket
+        End
+------------------------------------------------------------------
+
+/^(a?)+b/B
+------------------------------------------------------------------
+        Bra
+        ^
+        SCBra 1
+        a?
+        KetRmax
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
+/(?=a+)a(a+)++b/B
+------------------------------------------------------------------
+        Bra
+        Assert
+        a++
+        Ket
+        a
+        CBraPos 1
+        a++
+        KetRpos
+        b
+        Ket
+        End
+------------------------------------------------------------------
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
--
2.13.6