From 7729d10594572b5e5a3ebfa89064cc176ba50c7e Mon Sep 17 00:00:00 2001
From: ph10
Date: Mon, 2 Jul 2018 10:54:03 +0000
Subject: [PATCH] Fix global search/replace in pcre2test and pcre2_substitute()
when the pattern matches an empty string, but never at the starting offset.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@955 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.31.
Signed-off-by: Petr Písař
---
RunTest | 2 +-
doc/html/pcre2api.html | 5 +-
doc/html/pcre2pattern.html | 5 +-
doc/pcre2.txt | 175 ++++++++++++++++++++++++---------------------
doc/pcre2api.3 | 5 +-
src/pcre2.h.in | 3 +-
src/pcre2_error.c | 4 +-
src/pcre2_substitute.c | 41 +++++++++--
src/pcre2test.c | 77 ++++++++++++--------
testdata/testinput1 | 3 +
testdata/testinput2 | 3 +
testdata/testoutput1 | 9 +++
testdata/testoutput2 | 6 +-
13 files changed, 214 insertions(+), 124 deletions(-)
diff --git a/RunTest b/RunTest
index bc912da..f20f194 100755
--- a/RunTest
+++ b/RunTest
@@ -500,7 +500,7 @@ for bmode in "$test8" "$test16" "$test32"; do
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
- $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,200 >>testtry
+ $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -70,-62,-2,-1,0,100,101,191,200 >>testtry
checkresult $? 2 "$opt"
fi
done
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
index ba3b2ca..daa32a9 100644
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@@ -3108,7 +3108,10 @@ string in outputbuffer, replacing the part that was matched with the
replacement string, whose length is supplied in rlength. This can
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
which a \K item in a lookahead in the pattern causes the match to end before
-it starts are not supported, and give rise to an error return.
+it starts are not supported, and give rise to an error return. For global
+replacements, matches in which \K in a lookbehind causes the match to start
+earlier than the point that was reached in the previous iteration are also not
+supported.
The first seven arguments of pcre2_substitute() are the same as for
diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html
index c495cba..bc07e8b 100644
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@@ -1082,8 +1082,9 @@ sequences but the characters that they represent.)
Resetting the match start
-The escape sequence \K causes any previously matched characters not to be
-included in the final matched sequence. For example, the pattern:
+In normal use, the escape sequence \K causes any previously matched characters
+not to be included in the final matched sequence that is returned. For example,
+the pattern:
foo\Kbar
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index 79d94e3..a82f857 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -3014,75 +3014,78 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
replacement string, whose length is supplied in rlength. This can be
given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
which a \K item in a lookahead in the pattern causes the match to end
- before it starts are not supported, and give rise to an error return.
+ before it starts are not supported, and give rise to an error return.
+ For global replacements, matches in which \K in a lookbehind causes the
+ match to start earlier than the point that was reached in the previous
+ iteration are also not supported.
- The first seven arguments of pcre2_substitute() are the same as for
+ The first seven arguments of pcre2_substitute() are the same as for
pcre2_match(), except that the partial matching options are not permit-
- ted, and match_data may be passed as NULL, in which case a match data
- block is obtained and freed within this function, using memory manage-
- ment functions from the match context, if provided, or else those that
+ ted, and match_data may be passed as NULL, in which case a match data
+ block is obtained and freed within this function, using memory manage-
+ ment functions from the match context, if provided, or else those that
were used to allocate memory for the compiled code.
- The outlengthptr argument must point to a variable that contains the
- length, in code units, of the output buffer. If the function is suc-
- cessful, the value is updated to contain the length of the new string,
+ The outlengthptr argument must point to a variable that contains the
+ length, in code units, of the output buffer. If the function is suc-
+ cessful, the value is updated to contain the length of the new string,
excluding the trailing zero that is automatically added.
- If the function is not successful, the value set via outlengthptr
- depends on the type of error. For syntax errors in the replacement
- string, the value is the offset in the replacement string where the
- error was detected. For other errors, the value is PCRE2_UNSET by
- default. This includes the case of the output buffer being too small,
- unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which
- case the value is the minimum length needed, including space for the
- trailing zero. Note that in order to compute the required length,
- pcre2_substitute() has to simulate all the matching and copying,
+ If the function is not successful, the value set via outlengthptr
+ depends on the type of error. For syntax errors in the replacement
+ string, the value is the offset in the replacement string where the
+ error was detected. For other errors, the value is PCRE2_UNSET by
+ default. This includes the case of the output buffer being too small,
+ unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which
+ case the value is the minimum length needed, including space for the
+ trailing zero. Note that in order to compute the required length,
+ pcre2_substitute() has to simulate all the matching and copying,
instead of giving an error return as soon as the buffer overflows. Note
also that the length is in code units, not bytes.
- In the replacement string, which is interpreted as a UTF string in UTF
- mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
+ In the replacement string, which is interpreted as a UTF string in UTF
+ mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
option is set, a dollar character is an escape character that can spec-
- ify the insertion of characters from capturing groups or (*MARK),
- (*PRUNE), or (*THEN) items in the pattern. The following forms are
+ ify the insertion of characters from capturing groups or (*MARK),
+ (*PRUNE), or (*THEN) items in the pattern. The following forms are
always recognized:
$$ insert a dollar character
$ or ${} insert the contents of group
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
- Either a group number or a group name can be given for . Curly
- brackets are required only if the following character would be inter-
+ Either a group number or a group name can be given for . Curly
+ brackets are required only if the following character would be inter-
preted as part of the number or name. The number may be zero to include
- the entire matched string. For example, if the pattern a(b)c is
- matched with "=abc=" and the replacement string "+$1$0$1+", the result
+ the entire matched string. For example, if the pattern a(b)c is
+ matched with "=abc=" and the replacement string "+$1$0$1+", the result
is "=+babcb+=".
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or
- (*THEN) on the matching path that has a name. (*MARK) must always
- include a name, but (*PRUNE) and (*THEN) need not. For example, in the
- case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
- (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
- used to perform simple simultaneous substitutions, as this pcre2test
+ (*THEN) on the matching path that has a name. (*MARK) must always
+ include a name, but (*PRUNE) and (*THEN) need not. For example, in the
+ case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
+ (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
+ used to perform simple simultaneous substitutions, as this pcre2test
example shows:
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
apple lemon
2: pear orange
- As well as the usual options for pcre2_match(), a number of additional
+ As well as the usual options for pcre2_match(), a number of additional
options can be set in the options argument of pcre2_substitute().
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject
- string, replacing every matching substring. If this option is not set,
- only the first matching substring is replaced. The search for matches
- takes place in the original subject string (that is, previous replace-
- ments do not affect it). Iteration is implemented by advancing the
- startoffset value for each search, which is always passed the entire
+ string, replacing every matching substring. If this option is not set,
+ only the first matching substring is replaced. The search for matches
+ takes place in the original subject string (that is, previous replace-
+ ments do not affect it). Iteration is implemented by advancing the
+ startoffset value for each search, which is always passed the entire
subject string. If an offset limit is set in the match context, search-
ing stops when that limit is reached.
- You can restrict the effect of a global substitution to a portion of
+ You can restrict the effect of a global substitution to a portion of
the subject string by setting either or both of startoffset and an off-
set limit. Here is a pcre2test example:
@@ -3090,87 +3093,87 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
ABC ABC ABC ABC\=offset=3,offset_limit=12
2: ABC A!C A!C ABC
- When continuing with global substitutions after matching a substring
+ When continuing with global substitutions after matching a substring
with zero length, an attempt to find a non-empty match at the same off-
set is performed. If this is not successful, the offset is advanced by
one character except when CRLF is a valid newline sequence and the next
- two characters are CR, LF. In this case, the offset is advanced by two
+ two characters are CR, LF. In this case, the offset is advanced by two
characters.
- PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output
+ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output
buffer is too small. The default action is to return PCRE2_ERROR_NOMEM-
- ORY immediately. If this option is set, however, pcre2_substitute()
+ ORY immediately. If this option is set, however, pcre2_substitute()
continues to go through the motions of matching and substituting (with-
- out, of course, writing anything) in order to compute the size of buf-
- fer that is needed. This value is passed back via the outlengthptr
- variable, with the result of the function still being
+ out, of course, writing anything) in order to compute the size of buf-
+ fer that is needed. This value is passed back via the outlengthptr
+ variable, with the result of the function still being
PCRE2_ERROR_NOMEMORY.
- Passing a buffer size of zero is a permitted way of finding out how
- much memory is needed for given substitution. However, this does mean
+ Passing a buffer size of zero is a permitted way of finding out how
+ much memory is needed for given substitution. However, this does mean
that the entire operation is carried out twice. Depending on the appli-
- cation, it may be more efficient to allocate a large buffer and free
- the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER-
+ cation, it may be more efficient to allocate a large buffer and free
+ the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER-
FLOW_LENGTH.
- PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups
- that do not appear in the pattern to be treated as unset groups. This
- option should be used with care, because it means that a typo in a
- group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING
+ PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups
+ that do not appear in the pattern to be treated as unset groups. This
+ option should be used with care, because it means that a typo in a
+ group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING
error.
- PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including
+ PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including
unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be
- treated as empty strings when inserted as described above. If this
- option is not set, an attempt to insert an unset group causes the
- PCRE2_ERROR_UNSET error. This option does not influence the extended
+ treated as empty strings when inserted as described above. If this
+ option is not set, an attempt to insert an unset group causes the
+ PCRE2_ERROR_UNSET error. This option does not influence the extended
substitution syntax described below.
- PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
- replacement string. Without this option, only the dollar character is
- special, and only the group insertion forms listed above are valid.
+ PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
+ replacement string. Without this option, only the dollar character is
+ special, and only the group insertion forms listed above are valid.
When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
- Firstly, backslash in a replacement string is interpreted as an escape
+ Firstly, backslash in a replacement string is interpreted as an escape
character. The usual forms such as \n or \x{ddd} can be used to specify
- particular character codes, and backslash followed by any non-alphanu-
- meric character quotes that character. Extended quoting can be coded
+ particular character codes, and backslash followed by any non-alphanu-
+ meric character quotes that character. Extended quoting can be coded
using \Q...\E, exactly as in pattern strings.
- There are also four escape sequences for forcing the case of inserted
- letters. The insertion mechanism has three states: no case forcing,
+ There are also four escape sequences for forcing the case of inserted
+ letters. The insertion mechanism has three states: no case forcing,
force upper case, and force lower case. The escape sequences change the
current state: \U and \L change to upper or lower case forcing, respec-
- tively, and \E (when not terminating a \Q quoted sequence) reverts to
- no case forcing. The sequences \u and \l force the next character (if
- it is a letter) to upper or lower case, respectively, and then the
+ tively, and \E (when not terminating a \Q quoted sequence) reverts to
+ no case forcing. The sequences \u and \l force the next character (if
+ it is a letter) to upper or lower case, respectively, and then the
state automatically reverts to no case forcing. Case forcing applies to
all inserted characters, including those from captured groups and let-
ters within \Q...\E quoted sequences.
Note that case forcing sequences such as \U...\E do not nest. For exam-
- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
+ ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final
\E has no effect.
- The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
- flexibility to group substitution. The syntax is similar to that used
+ The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
+ flexibility to group substitution. The syntax is similar to that used
by Bash:
${:-}
${:+:}
- As before, may be a group number or a name. The first form speci-
- fies a default value. If group is set, its value is inserted; if
- not, is expanded and the result inserted. The second form
- specifies strings that are expanded and inserted when group is set
- or unset, respectively. The first form is just a convenient shorthand
+ As before, may be a group number or a name. The first form speci-
+ fies a default value. If group is set, its value is inserted; if
+ not, is expanded and the result inserted. The second form
+ specifies strings that are expanded and inserted when group is set
+ or unset, respectively. The first form is just a convenient shorthand
for
${:+${}:}
- Backslash can be used to escape colons and closing curly brackets in
- the replacement strings. A change of the case forcing state within a
- replacement string remains in force afterwards, as shown in this
+ Backslash can be used to escape colons and closing curly brackets in
+ the replacement strings. A change of the case forcing state within a
+ replacement string remains in force afterwards, as shown in this
pcre2test example:
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
@@ -6614,8 +6617,9 @@ BACKSLASH
Resetting the match start
- The escape sequence \K causes any previously matched characters not to
- be included in the final matched sequence. For example, the pattern:
+ In normal use, the escape sequence \K causes any previously matched
+ characters not to be included in the final matched sequence that is
+ returned. For example, the pattern:
foo\Kbar
@@ -6634,7 +6638,16 @@ BACKSLASH
defined". In PCRE2, \K is acted upon when it occurs inside positive
assertions, but is ignored in negative assertions. Note that when a
pattern such as (?=ab\K) matches, the reported start of the match can
- be greater than the end of the match.
+ be greater than the end of the match. Using \K in a lookbehind asser-
+ tion at the start of a pattern can also lead to odd effects. For exam-
+ ple, consider this pattern:
+
+ (?<=\Kfoo)bar
+
+ If the subject is "foobar", a call to pcre2_match() with a starting
+ offset of 3 succeeds and reports the matching string as "foobar", that
+ is, the start of the reported match is earlier than where the match
+ started.
Simple assertions
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index 786b314..57b6d31 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -3122,7 +3122,10 @@ string in \fIoutputbuffer\fP, replacing the part that was matched with the
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
which a \eK item in a lookahead in the pattern causes the match to end before
-it starts are not supported, and give rise to an error return.
+it starts are not supported, and give rise to an error return. For global
+replacements, matches in which \eK in a lookbehind causes the match to start
+earlier than the point that was reached in the previous iteration are also not
+supported.
.P
The first seven arguments of \fBpcre2_substitute()\fP are the same as for
\fBpcre2_match()\fP, except that the partial matching options are not
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index a3a3fa6..0bc8cca 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, second API, to be
#included by applications that call PCRE2 functions.
- Copyright (c) 2016-2017 University of Cambridge
+ Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -387,6 +387,7 @@ released, the numbers must not be changed. */
#define PCRE2_ERROR_BADSERIALIZEDDATA (-62)
#define PCRE2_ERROR_HEAPLIMIT (-63)
#define PCRE2_ERROR_CONVERT_SYNTAX (-64)
+#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65)
/* Request types for pcre2_pattern_info() */
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
index d98cae9..dce1efb 100644
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2017 University of Cambridge
+ New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -260,6 +260,8 @@ static const unsigned char match_error_texts[] =
"bad serialized data\0"
"heap limit exceeded\0"
"invalid syntax\0"
+ /* 65 */
+ "internal error - duplicate substitution match\0"
;
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
index 8da951f..582a42d 100644
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016 University of Cambridge
+ New API code Copyright (c) 2016-2018 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -238,10 +238,12 @@ PCRE2_SPTR repend;
PCRE2_SIZE extra_needed = 0;
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
PCRE2_SIZE *ovector;
+PCRE2_SIZE ovecsave[3];
buff_offset = 0;
lengthleft = buff_length = *blength;
*blength = PCRE2_UNSET;
+ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
/* Partial matching is not valid. */
@@ -368,6 +370,26 @@ do
rc = PCRE2_ERROR_BADSUBSPATTERN;
goto EXIT;
}
+
+ /* Check for the same match as previous. This is legitimate after matching an
+ empty string that starts after the initial match offset. We have tried again
+ at the match point in case the pattern is one like /(?<=\G.)/ which can never
+ match at its starting point, so running the match achieves the bumpalong. If
+ we do get the same (null) match at the original match point, it isn't such a
+ pattern, so we now do the empty string magic. In all other cases, a repeat
+ match should never occur. */
+
+ if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
+ {
+ if (ovector[0] == ovector[1] && ovecsave[2] != start_offset)
+ {
+ goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ ovecsave[2] = start_offset;
+ continue; /* Back to the top of the loop */
+ }
+ rc = PCRE2_ERROR_INTERNAL_DUPMATCH;
+ goto EXIT;
+ }
/* Count substitutions with a paranoid check for integer overflow; surely no
real call to this function would ever hit this! */
@@ -799,13 +821,18 @@ do
} /* End handling a literal code unit */
} /* End of loop for scanning the replacement. */
- /* The replacement has been copied to the output. Update the start offset to
- point to the rest of the subject string. If we matched an empty string,
- do the magic for global matches. */
-
- start_offset = ovector[1];
- goptions = (ovector[0] != ovector[1])? 0 :
+ /* The replacement has been copied to the output. Save the details of this
+ match. See above for how this data is used. If we matched an empty string, do
+ the magic for global matches. Finally, update the start offset to point to
+ the rest of the subject string. */
+
+ ovecsave[0] = ovector[0];
+ ovecsave[1] = ovector[1];
+ ovecsave[2] = start_offset;
+
+ goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 :
PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART;
+ start_offset = ovector[1];
} while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */
/* Copy the rest of the subject. */
diff --git a/src/pcre2test.c b/src/pcre2test.c
index ad3db2c..d83aa43 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -6283,6 +6283,7 @@ size_t needlen;
void *use_dat_context;
BOOL utf;
BOOL subject_literal;
+PCRE2_SIZE ovecsave[3];
#ifdef SUPPORT_PCRE2_8
uint8_t *q8 = NULL;
@@ -6929,6 +6930,9 @@ if (dat_datctl.replacement[0] != 0)
if (timeitm)
fprintf(outfile, "** Timing is not supported with replace: ignored\n");
+
+ if ((dat_datctl.control & CTL_ALTGLOBAL) != 0)
+ fprintf(outfile, "** Altglobal is not supported with replace: ignored\n");
xoptions = (((dat_datctl.control & CTL_GLOBAL) == 0)? 0 :
PCRE2_SUBSTITUTE_GLOBAL) |
@@ -7048,35 +7052,24 @@ if (dat_datctl.replacement[0] != 0)
}
fprintf(outfile, "\n");
+ show_memory = FALSE;
+ return PR_OK;
} /* End of substitution handling */
/* When a replacement string is not provided, run a loop for global matching
-with one of the basic matching functions. */
+with one of the basic matching functions. For altglobal (or first time round
+the loop), set an "unset" value for the previous match info. */
+
+ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET;
-else for (gmatched = 0;; gmatched++)
+for (gmatched = 0;; gmatched++)
{
PCRE2_SIZE j;
int capcount;
PCRE2_SIZE *ovector;
- PCRE2_SIZE ovecsave[2];
ovector = FLD(match_data, ovector);
- /* After the first time round a global loop, for a normal global (/g)
- iteration, save the current ovector[0,1] so that we can check that they do
- change each time. Otherwise a matching bug that returns the same string
- causes an infinite loop. It has happened! */
-
- if (gmatched > 0 && (dat_datctl.control & CTL_GLOBAL) != 0)
- {
- ovecsave[0] = ovector[0];
- ovecsave[1] = ovector[1];
- }
-
- /* For altglobal (or first time round the loop), set an "unset" value. */
-
- else ovecsave[0] = ovecsave[1] = PCRE2_UNSET;
-
/* Fill the ovector with junk to detect elements that do not get set
when they should be. */
@@ -7243,12 +7236,23 @@ else for (gmatched = 0;; gmatched++)
}
/* If this is not the first time round a global loop, check that the
- returned string has changed. If not, there is a bug somewhere and we must
- break the loop because it will go on for ever. We know that there are
- always at least two elements in the ovector. */
-
+ returned string has changed. If it has not, check for an empty string match
+ at a different starting offset from the previous match. This is a failed test
+ retry for null-matching patterns that don't match at their starting offset,
+ for example /(?<=\G.)/. A repeated match at the same point is not such a
+ pattern, and must be discarded, and we then proceed to seek a non-null
+ match at the current point. For any other repeated match, there is a bug
+ somewhere and we must break the loop because it will go on for ever. We
+ know that there are always at least two elements in the ovector. */
+
if (gmatched > 0 && ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1])
{
+ if (ovector[0] == ovector[1] && ovecsave[2] != dat_datctl.offset)
+ {
+ g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ ovecsave[2] = dat_datctl.offset;
+ continue; /* Back to the top of the loop */
+ }
fprintf(outfile,
"** PCRE2 error: global repeat returned the same string as previous\n");
fprintf(outfile, "** Global loop abandoned\n");
@@ -7556,6 +7560,7 @@ else for (gmatched = 0;; gmatched++)
if ((dat_datctl.control & CTL_ANYGLOB) == 0) break; else
{
+ PCRE2_SIZE match_offset = FLD(match_data, ovector)[0];
PCRE2_SIZE end_offset = FLD(match_data, ovector)[1];
/* We must now set up for the next iteration of a global search. If we have
@@ -7563,12 +7568,19 @@ else for (gmatched = 0;; gmatched++)
subject. If so, the loop is over. Otherwise, mimic what Perl's /g option
does. Set PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED and try the match again
at the same point. If this fails it will be picked up above, where a fake
- match is set up so that at this point we advance to the next character. */
-
- if (FLD(match_data, ovector)[0] == end_offset)
+ match is set up so that at this point we advance to the next character.
+
+ However, in order to cope with patterns that never match at their starting
+ offset (e.g. /(?<=\G.)/) we don't do this when the match offset is greater
+ than the starting offset. This means there will be a retry with the
+ starting offset at the match offset. If this returns the same match again,
+ it is picked up above and ignored, and the special action is then taken. */
+
+ if (match_offset == end_offset)
{
- if (end_offset == ulen) break; /* End of subject */
- g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ if (end_offset == ulen) break; /* End of subject */
+ if (match_offset <= dat_datctl.offset)
+ g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
/* However, even after matching a non-empty string, there is still one
@@ -7606,10 +7618,19 @@ else for (gmatched = 0;; gmatched++)
}
}
- /* For /g (global), update the start offset, leaving the rest alone. */
+ /* For a normal global (/g) iteration, save the current ovector[0,1] and
+ the starting offset so that we can check that they do change each time.
+ Otherwise a matching bug that returns the same string causes an infinite
+ loop. It has happened! Then update the start offset, leaving other
+ parameters alone. */
if ((dat_datctl.control & CTL_GLOBAL) != 0)
+ {
+ ovecsave[0] = ovector[0];
+ ovecsave[1] = ovector[1];
+ ovecsave[2] = dat_datctl.offset;
dat_datctl.offset = end_offset;
+ }
/* For altglobal, just update the pointer and length. */
diff --git a/testdata/testinput1 b/testdata/testinput1
index 9a9c5fd..fb50238 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -6189,4 +6189,7 @@ ef) x/x,mark
/(?=a+)a(a+)++b/
aab
+/(?<=\G.)/g,aftertext
+ abc
+
# End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index 5d3a80e..797b0f7 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4935,6 +4935,9 @@ a)"xI
//replace=0
\=offset=7
+/(?<=\G.)/g,replace=+
+ abc
+
".+\QX\E+"B,no_auto_possess
".+\QX\E+"B,auto_callout,no_auto_possess
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 9c55be9..348dcbc 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9822,4 +9822,13 @@ No match
0: aab
1: a
+/(?<=\G.)/g,aftertext
+ abc
+ 0:
+ 0+ bc
+ 0:
+ 0+ c
+ 0:
+ 0+
+
# End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index fcaac8f..5c13f5b 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -15545,6 +15545,10 @@ Failed: error -57 at offset 2 in replacement: bad escape sequence in replacement
\=offset=7
Failed: error -33: bad offset value
+/(?<=\G.)/g,replace=+
+ abc
+ 3: a+b+c+
+
".+\QX\E+"B,no_auto_possess
------------------------------------------------------------------
Bra
@@ -16576,7 +16580,7 @@ No match
------------------------------------------------------------------
# End of testinput2
-Error -65: PCRE2_ERROR_BADDATA (unknown error number)
+Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
Error -2: partial match
Error -1: no match
--
2.14.4