395 lines
8.9 KiB
Diff
From b70b8394e79360f92c87a169c1ba982a7e22d9d6 Mon Sep 17 00:00:00 2001
|
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
|
Date: Wed, 31 Jan 2018 17:53:56 +0000
|
|
Subject: [PATCH] Fix auto-possessification bug at the end of a capturing group that is called recursively.
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@912 6239d852-aaf2-0410-a92c-79f79f948069
|
|
|
|
Petr Písař: Ported to 10.31-RC1.
|
|
|
|
src/pcre2_auto_possess.c | 56 ++++++++++++++++++------
|
|
testdata/testinput1 | 30 +++++++++++++
|
|
testdata/testinput2 | 17 ++++++++
|
|
testdata/testoutput1 | 64 ++++++++++++++++++++++++++++
|
|
testdata/testoutput2 | 109 ++++++++++++++++++++++++++++++++++++++++++++++-
|
|
|
|
diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c
|
|
index ad3543f..23275a2 100644
|
|
--- a/src/pcre2_auto_possess.c
|
|
+++ b/src/pcre2_auto_possess.c
|
|
@@ -558,47 +558,73 @@ for(;;)
|
|
continue;
|
|
}
|
|
|
|
+ /* At the end of a branch, skip to the end of the group. */
|
|
+
|
|
if (c == OP_ALT)
|
|
{
|
|
do code += GET(code, 1); while (*code == OP_ALT);
|
|
c = *code;
|
|
}
|
|
|
|
+ /* Inspect the next opcode. */
|
|
+
|
|
switch(c)
|
|
{
|
|
- case OP_END:
|
|
- case OP_KETRPOS:
|
|
- /* TRUE only in greedy case. The non-greedy case could be replaced by
|
|
- an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
|
|
- uses more memory, which we cannot get at this stage.) */
|
|
+ /* We can always possessify a greedy iterator at the end of the pattern,
|
|
+ which is reached after skipping over the final OP_KET. A non-greedy
|
|
+ iterator must never be possessified. */
|
|
|
|
+ case OP_END:
|
|
return base_list[1] != 0;
|
|
|
|
+ /* When an iterator is at the end of certain kinds of group we can inspect
|
|
+ what follows the group by skipping over the closing ket. Note that this
|
|
+ does not apply to OP_KETRMAX or OP_KETRMIN because what follows any given
|
|
+ iteration is variable (could be another iteration or could be the next
|
|
+ item). As these two opcodes are not listed in the next switch, they will
|
|
+ end up as the next code to inspect, and return FALSE by virtue of being
|
|
+ unsupported. */
|
|
+
|
|
case OP_KET:
|
|
- /* If the bracket is capturing, and referenced by an OP_RECURSE, or
|
|
- it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
|
|
- cannot be converted to a possessive form. */
|
|
+ case OP_KETRPOS:
|
|
+ /* The non-greedy case cannot be converted to a possessive form. */
|
|
|
|
if (base_list[1] == 0) return FALSE;
|
|
|
|
+ /* If the bracket is capturing it might be referenced by an OP_RECURSE
|
|
+ so its last iterator can never be possessified if the pattern contains
|
|
+ recursions. (This could be improved by keeping a list of group numbers that
|
|
+ are called by recursion.) */
|
|
+
|
|
switch(*(code - GET(code, 1)))
|
|
{
|
|
+ case OP_CBRA:
|
|
+ case OP_SCBRA:
|
|
+ case OP_CBRAPOS:
|
|
+ case OP_SCBRAPOS:
|
|
+ if (cb->had_recurse) return FALSE;
|
|
+ break;
|
|
+
|
|
+ /* Atomic sub-patterns and assertions can always auto-possessify their
|
|
+ last iterator. However, if the group was entered as a result of checking
|
|
+ a previous iterator, this is not possible. */
|
|
+
|
|
case OP_ASSERT:
|
|
case OP_ASSERT_NOT:
|
|
case OP_ASSERTBACK:
|
|
case OP_ASSERTBACK_NOT:
|
|
case OP_ONCE:
|
|
|
|
- /* Atomic sub-patterns and assertions can always auto-possessify their
|
|
- last iterator. However, if the group was entered as a result of checking
|
|
- a previous iterator, this is not possible. */
|
|
-
|
|
return !entered_a_group;
|
|
}
|
|
|
|
+ /* Skip over the bracket and inspect what comes next. */
|
|
+
|
|
code += PRIV(OP_lengths)[c];
|
|
continue;
|
|
|
|
+ /* Handle cases where the next item is a group. */
|
|
+
|
|
case OP_ONCE:
|
|
case OP_BRA:
|
|
case OP_CBRA:
|
|
@@ -637,11 +663,15 @@ for(;;)
|
|
code += PRIV(OP_lengths)[c];
|
|
continue;
|
|
|
|
+ /* The next opcode does not need special handling; fall through and use it
|
|
+ to see if the base can be possessified. */
|
|
+
|
|
default:
|
|
break;
|
|
}
|
|
|
|
- /* Check for a supported opcode, and load its properties. */
|
|
+ /* We now have the next appropriate opcode to compare with the base. Check
|
|
+ for a supported opcode, and load its properties. */
|
|
|
|
code = get_chr_property_list(code, utf, cb->fcc, list);
|
|
if (code == NULL) return FALSE; /* Unsupported */
|
|
diff --git a/testdata/testinput1 b/testdata/testinput1
|
|
index 756b4af..9a9c5fd 100644
|
|
--- a/testdata/testinput1
|
|
+++ b/testdata/testinput1
|
|
@@ -6159,4 +6159,34 @@ ef) x/x,mark
|
|
/((?<=((*ACCEPT))X)\1?Y(*ACCEPT))\1/
|
|
XYYZ
|
|
|
|
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
|
|
+ aa
|
|
+ a
|
|
+
|
|
+/^(a?)b(?1)a/
|
|
+ abaa
|
|
+ aba
|
|
+ baa
|
|
+ ba
|
|
+
|
|
+/^(a?)+b(?1)a/
|
|
+ abaa
|
|
+ aba
|
|
+ baa
|
|
+ ba
|
|
+
|
|
+/^(a?)++b(?1)a/
|
|
+ abaa
|
|
+ aba
|
|
+ baa
|
|
+ ba
|
|
+
|
|
+/^(a?)+b/
|
|
+ b
|
|
+ ab
|
|
+ aaab
|
|
+
|
|
+/(?=a+)a(a+)++b/
|
|
+ aab
|
|
+
|
|
# End of testinput1
|
|
diff --git a/testdata/testinput2 b/testdata/testinput2
|
|
index 2c2334c..5d3a80e 100644
|
|
--- a/testdata/testinput2
|
|
+++ b/testdata/testinput2
|
|
@@ -5412,4 +5412,21 @@ a)"xI
|
|
\= Expect no match
|
|
\na
|
|
|
|
+# These tests are matched in test 1 as they are Perl compatible. Here we are
|
|
+# looking at what does and does not get auto-possessified.
|
|
+
|
|
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
|
|
+
|
|
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
|
|
+
|
|
+/^(a?)b(?1)a/B
|
|
+
|
|
+/^(a?)+b(?1)a/B
|
|
+
|
|
+/^(a?)++b(?1)a/B
|
|
+
|
|
+/^(a?)+b/B
|
|
+
|
|
+/(?=a+)a(a+)++b/B
|
|
+
|
|
# End of testinput2
|
|
diff --git a/testdata/testoutput1 b/testdata/testoutput1
|
|
index 7cf647e..9c55be9 100644
|
|
--- a/testdata/testoutput1
|
|
+++ b/testdata/testoutput1
|
|
@@ -9758,4 +9758,68 @@ No match
|
|
1: Y
|
|
2:
|
|
|
|
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/
|
|
+ aa
|
|
+ 0: aa
|
|
+ a
|
|
+ 0: a
|
|
+
|
|
+/^(a?)b(?1)a/
|
|
+ abaa
|
|
+ 0: abaa
|
|
+ 1: a
|
|
+ aba
|
|
+ 0: aba
|
|
+ 1: a
|
|
+ baa
|
|
+ 0: baa
|
|
+ 1:
|
|
+ ba
|
|
+ 0: ba
|
|
+ 1:
|
|
+
|
|
+/^(a?)+b(?1)a/
|
|
+ abaa
|
|
+ 0: abaa
|
|
+ 1:
|
|
+ aba
|
|
+ 0: aba
|
|
+ 1:
|
|
+ baa
|
|
+ 0: baa
|
|
+ 1:
|
|
+ ba
|
|
+ 0: ba
|
|
+ 1:
|
|
+
|
|
+/^(a?)++b(?1)a/
|
|
+ abaa
|
|
+ 0: abaa
|
|
+ 1:
|
|
+ aba
|
|
+ 0: aba
|
|
+ 1:
|
|
+ baa
|
|
+ 0: baa
|
|
+ 1:
|
|
+ ba
|
|
+ 0: ba
|
|
+ 1:
|
|
+
|
|
+/^(a?)+b/
|
|
+ b
|
|
+ 0: b
|
|
+ 1:
|
|
+ ab
|
|
+ 0: ab
|
|
+ 1:
|
|
+ aaab
|
|
+ 0: aaab
|
|
+ 1:
|
|
+
|
|
+/(?=a+)a(a+)++b/
|
|
+ aab
|
|
+ 0: aab
|
|
+ 1: a
|
|
+
|
|
# End of testinput1
|
|
diff --git a/testdata/testoutput2 b/testdata/testoutput2
|
|
index f783320..fcaac8f 100644
|
|
--- a/testdata/testoutput2
|
|
+++ b/testdata/testoutput2
|
|
@@ -12701,7 +12701,7 @@ Subject length lower bound = 5
|
|
Ket
|
|
a
|
|
CBraPos 1
|
|
- a++
|
|
+ a+
|
|
KetRpos
|
|
a
|
|
Ket
|
|
@@ -16468,6 +16468,113 @@ No match
|
|
\na
|
|
No match
|
|
|
|
+# These tests are matched in test 1 as they are Perl compatible. Here we are
|
|
+# looking at what does and does not get auto-possessified.
|
|
+
|
|
+/(?(DEFINE)(?<optional_a>a?))^(?&optional_a)a$/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ Cond
|
|
+ Cond false
|
|
+ CBra 1
|
|
+ a?
|
|
+ Ket
|
|
+ Ket
|
|
+ ^
|
|
+ Recurse
|
|
+ a
|
|
+ $
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/(?(DEFINE)(?<optional_a>a?)X)^(?&optional_a)a$/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ Cond
|
|
+ Cond false
|
|
+ CBra 1
|
|
+ a?
|
|
+ Ket
|
|
+ X
|
|
+ Ket
|
|
+ ^
|
|
+ Recurse
|
|
+ a
|
|
+ $
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/^(a?)b(?1)a/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ ^
|
|
+ CBra 1
|
|
+ a?
|
|
+ Ket
|
|
+ b
|
|
+ Recurse
|
|
+ a
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/^(a?)+b(?1)a/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ ^
|
|
+ SCBra 1
|
|
+ a?
|
|
+ KetRmax
|
|
+ b
|
|
+ Recurse
|
|
+ a
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/^(a?)++b(?1)a/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ ^
|
|
+ SCBraPos 1
|
|
+ a?
|
|
+ KetRpos
|
|
+ b
|
|
+ Recurse
|
|
+ a
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/^(a?)+b/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ ^
|
|
+ SCBra 1
|
|
+ a?
|
|
+ KetRmax
|
|
+ b
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
+/(?=a+)a(a+)++b/B
|
|
+------------------------------------------------------------------
|
|
+ Bra
|
|
+ Assert
|
|
+ a++
|
|
+ Ket
|
|
+ a
|
|
+ CBraPos 1
|
|
+ a++
|
|
+ KetRpos
|
|
+ b
|
|
+ Ket
|
|
+ End
|
|
+------------------------------------------------------------------
|
|
+
|
|
# End of testinput2
|
|
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
|
|
Error -62: bad serialized data
|
|
--
|
|
2.13.6
|
|
|