From de4342d19f35c31160d8762f99cf22bc9ad5be87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Mon, 2 Jul 2018 13:34:34 +0200 Subject: [PATCH] Fix global search/replace in pcre2test and pcre2_substitute() when the pattern matches an empty string, but never at the starting offset --- ...h-replace-in-pcre2test-and-pcre2_sub.patch | 708 ++++++++++++++++++ pcre2.spec | 11 +- 2 files changed, 718 insertions(+), 1 deletion(-) create mode 100644 pcre2-10.31-Fix-global-search-replace-in-pcre2test-and-pcre2_sub.patch diff --git a/pcre2-10.31-Fix-global-search-replace-in-pcre2test-and-pcre2_sub.patch b/pcre2-10.31-Fix-global-search-replace-in-pcre2test-and-pcre2_sub.patch new file mode 100644 index 0000000..69f190e --- /dev/null +++ b/pcre2-10.31-Fix-global-search-replace-in-pcre2test-and-pcre2_sub.patch @@ -0,0 +1,708 @@ +From 7729d10594572b5e5a3ebfa89064cc176ba50c7e Mon Sep 17 00:00:00 2001 +From: ph10 +Date: Mon, 2 Jul 2018 10:54:03 +0000 +Subject: [PATCH] Fix global search/replace in pcre2test and pcre2_substitute() + when the pattern matches an empty string, but never at the starting offset. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@955 6239d852-aaf2-0410-a92c-79f79f948069 +Petr Písař: Ported to 10.31. 
+ +Signed-off-by: Petr Písař +--- + RunTest | 2 +- + doc/html/pcre2api.html | 5 +- + doc/html/pcre2pattern.html | 5 +- + doc/pcre2.txt | 175 ++++++++++++++++++++++++--------------------- + doc/pcre2api.3 | 5 +- + src/pcre2.h.in | 3 +- + src/pcre2_error.c | 4 +- + src/pcre2_substitute.c | 41 +++++++++-- + src/pcre2test.c | 77 ++++++++++++-------- + testdata/testinput1 | 3 + + testdata/testinput2 | 3 + + testdata/testoutput1 | 9 +++ + testdata/testoutput2 | 6 +- + 13 files changed, 214 insertions(+), 124 deletions(-) + +diff --git a/RunTest b/RunTest +index bc912da..f20f194 100755 +--- a/RunTest ++++ b/RunTest +@@ -500,7 +500,7 @@ for bmode in "$test8" "$test16" "$test32"; do + for opt in "" $jitopt; do + $sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry + if [ $? = 0 ] ; then +- $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,200 >>testtry ++ $sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -70,-62,-2,-1,0,100,101,191,200 >>testtry + checkresult $? 2 "$opt" + fi + done +diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html +index ba3b2ca..daa32a9 100644 +--- a/doc/html/pcre2api.html ++++ b/doc/html/pcre2api.html +@@ -3108,7 +3108,10 @@ string in outputbuffer, replacing the part that was matched with the + replacement string, whose length is supplied in rlength. This can + be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in + which a \K item in a lookahead in the pattern causes the match to end before +-it starts are not supported, and give rise to an error return. ++it starts are not supported, and give rise to an error return. For global ++replacements, matches in which \K in a lookbehind causes the match to start ++earlier than the point that was reached in the previous iteration are also not ++supported. +

+

+ The first seven arguments of pcre2_substitute() are the same as for +diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html +index c495cba..bc07e8b 100644 +--- a/doc/html/pcre2pattern.html ++++ b/doc/html/pcre2pattern.html +@@ -1082,8 +1082,9 @@ sequences but the characters that they represent.) + Resetting the match start +
+

+-The escape sequence \K causes any previously matched characters not to be +-included in the final matched sequence. For example, the pattern: ++In normal use, the escape sequence \K causes any previously matched characters ++not to be included in the final matched sequence that is returned. For example, ++the pattern: +

+   foo\Kbar
+ 
+diff --git a/doc/pcre2.txt b/doc/pcre2.txt +index 79d94e3..a82f857 100644 +--- a/doc/pcre2.txt ++++ b/doc/pcre2.txt +@@ -3014,75 +3014,78 @@ CREATING A NEW STRING WITH SUBSTITUTIONS + replacement string, whose length is supplied in rlength. This can be + given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in + which a \K item in a lookahead in the pattern causes the match to end +- before it starts are not supported, and give rise to an error return. ++ before it starts are not supported, and give rise to an error return. ++ For global replacements, matches in which \K in a lookbehind causes the ++ match to start earlier than the point that was reached in the previous ++ iteration are also not supported. + +- The first seven arguments of pcre2_substitute() are the same as for ++ The first seven arguments of pcre2_substitute() are the same as for + pcre2_match(), except that the partial matching options are not permit- +- ted, and match_data may be passed as NULL, in which case a match data +- block is obtained and freed within this function, using memory manage- +- ment functions from the match context, if provided, or else those that ++ ted, and match_data may be passed as NULL, in which case a match data ++ block is obtained and freed within this function, using memory manage- ++ ment functions from the match context, if provided, or else those that + were used to allocate memory for the compiled code. + +- The outlengthptr argument must point to a variable that contains the +- length, in code units, of the output buffer. If the function is suc- +- cessful, the value is updated to contain the length of the new string, ++ The outlengthptr argument must point to a variable that contains the ++ length, in code units, of the output buffer. If the function is suc- ++ cessful, the value is updated to contain the length of the new string, + excluding the trailing zero that is automatically added. 
+ +- If the function is not successful, the value set via outlengthptr +- depends on the type of error. For syntax errors in the replacement +- string, the value is the offset in the replacement string where the +- error was detected. For other errors, the value is PCRE2_UNSET by +- default. This includes the case of the output buffer being too small, +- unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which +- case the value is the minimum length needed, including space for the +- trailing zero. Note that in order to compute the required length, +- pcre2_substitute() has to simulate all the matching and copying, ++ If the function is not successful, the value set via outlengthptr ++ depends on the type of error. For syntax errors in the replacement ++ string, the value is the offset in the replacement string where the ++ error was detected. For other errors, the value is PCRE2_UNSET by ++ default. This includes the case of the output buffer being too small, ++ unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set (see below), in which ++ case the value is the minimum length needed, including space for the ++ trailing zero. Note that in order to compute the required length, ++ pcre2_substitute() has to simulate all the matching and copying, + instead of giving an error return as soon as the buffer overflows. Note + also that the length is in code units, not bytes. + +- In the replacement string, which is interpreted as a UTF string in UTF +- mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK ++ In the replacement string, which is interpreted as a UTF string in UTF ++ mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK + option is set, a dollar character is an escape character that can spec- +- ify the insertion of characters from capturing groups or (*MARK), +- (*PRUNE), or (*THEN) items in the pattern. 
The following forms are ++ ify the insertion of characters from capturing groups or (*MARK), ++ (*PRUNE), or (*THEN) items in the pattern. The following forms are + always recognized: + + $$ insert a dollar character + $<n> or ${<n>} insert the contents of group <n> + $*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name + +- Either a group number or a group name can be given for <n>. Curly +- brackets are required only if the following character would be inter- ++ Either a group number or a group name can be given for <n>. Curly ++ brackets are required only if the following character would be inter- + preted as part of the number or name. The number may be zero to include +- the entire matched string. For example, if the pattern a(b)c is +- matched with "=abc=" and the replacement string "+$1$0$1+", the result ++ the entire matched string. For example, if the pattern a(b)c is ++ matched with "=abc=" and the replacement string "+$1$0$1+", the result + is "=+babcb+=". + + $*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or +- (*THEN) on the matching path that has a name. (*MARK) must always +- include a name, but (*PRUNE) and (*THEN) need not. For example, in the +- case of (*MARK:A)(*PRUNE) the name inserted is "A", but for +- (*MARK:A)(*PRUNE:B) the relevant name is "B". 
This facility can be ++ used to perform simple simultaneous substitutions, as this pcre2test + example shows: + + /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} + apple lemon + 2: pear orange + +- As well as the usual options for pcre2_match(), a number of additional ++ As well as the usual options for pcre2_match(), a number of additional + options can be set in the options argument of pcre2_substitute(). + + PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject +- string, replacing every matching substring. If this option is not set, +- only the first matching substring is replaced. The search for matches +- takes place in the original subject string (that is, previous replace- +- ments do not affect it). Iteration is implemented by advancing the +- startoffset value for each search, which is always passed the entire ++ string, replacing every matching substring. If this option is not set, ++ only the first matching substring is replaced. The search for matches ++ takes place in the original subject string (that is, previous replace- ++ ments do not affect it). Iteration is implemented by advancing the ++ startoffset value for each search, which is always passed the entire + subject string. If an offset limit is set in the match context, search- + ing stops when that limit is reached. + +- You can restrict the effect of a global substitution to a portion of ++ You can restrict the effect of a global substitution to a portion of + the subject string by setting either or both of startoffset and an off- + set limit. Here is a pcre2test example: + +@@ -3090,87 +3093,87 @@ CREATING A NEW STRING WITH SUBSTITUTIONS + ABC ABC ABC ABC\=offset=3,offset_limit=12 + 2: ABC A!C A!C ABC + +- When continuing with global substitutions after matching a substring ++ When continuing with global substitutions after matching a substring + with zero length, an attempt to find a non-empty match at the same off- + set is performed. 
If this is not successful, the offset is advanced by + one character except when CRLF is a valid newline sequence and the next +- two characters are CR, LF. In this case, the offset is advanced by two ++ two characters are CR, LF. In this case, the offset is advanced by two + characters. + +- PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output ++ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output + buffer is too small. The default action is to return PCRE2_ERROR_NOMEM- +- ORY immediately. If this option is set, however, pcre2_substitute() ++ ORY immediately. If this option is set, however, pcre2_substitute() + continues to go through the motions of matching and substituting (with- +- out, of course, writing anything) in order to compute the size of buf- +- fer that is needed. This value is passed back via the outlengthptr +- variable, with the result of the function still being ++ out, of course, writing anything) in order to compute the size of buf- ++ fer that is needed. This value is passed back via the outlengthptr ++ variable, with the result of the function still being + PCRE2_ERROR_NOMEMORY. + +- Passing a buffer size of zero is a permitted way of finding out how +- much memory is needed for given substitution. However, this does mean ++ Passing a buffer size of zero is a permitted way of finding out how ++ much memory is needed for given substitution. However, this does mean + that the entire operation is carried out twice. Depending on the appli- +- cation, it may be more efficient to allocate a large buffer and free +- the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- ++ cation, it may be more efficient to allocate a large buffer and free ++ the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- + FLOW_LENGTH. + +- PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups +- that do not appear in the pattern to be treated as unset groups. 
This +- option should be used with care, because it means that a typo in a +- group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING ++ PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups ++ that do not appear in the pattern to be treated as unset groups. This ++ option should be used with care, because it means that a typo in a ++ group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING + error. + +- PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including ++ PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including + unknown groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be +- treated as empty strings when inserted as described above. If this +- option is not set, an attempt to insert an unset group causes the +- PCRE2_ERROR_UNSET error. This option does not influence the extended ++ treated as empty strings when inserted as described above. If this ++ option is not set, an attempt to insert an unset group causes the ++ PCRE2_ERROR_UNSET error. This option does not influence the extended + substitution syntax described below. + +- PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the +- replacement string. Without this option, only the dollar character is +- special, and only the group insertion forms listed above are valid. ++ PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the ++ replacement string. Without this option, only the dollar character is ++ special, and only the group insertion forms listed above are valid. + When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: + +- Firstly, backslash in a replacement string is interpreted as an escape ++ Firstly, backslash in a replacement string is interpreted as an escape + character. The usual forms such as \n or \x{ddd} can be used to specify +- particular character codes, and backslash followed by any non-alphanu- +- meric character quotes that character. 
Extended quoting can be coded ++ particular character codes, and backslash followed by any non-alphanu- ++ meric character quotes that character. Extended quoting can be coded + using \Q...\E, exactly as in pattern strings. + +- There are also four escape sequences for forcing the case of inserted +- letters. The insertion mechanism has three states: no case forcing, ++ There are also four escape sequences for forcing the case of inserted ++ letters. The insertion mechanism has three states: no case forcing, + force upper case, and force lower case. The escape sequences change the + current state: \U and \L change to upper or lower case forcing, respec- +- tively, and \E (when not terminating a \Q quoted sequence) reverts to +- no case forcing. The sequences \u and \l force the next character (if +- it is a letter) to upper or lower case, respectively, and then the ++ tively, and \E (when not terminating a \Q quoted sequence) reverts to ++ no case forcing. The sequences \u and \l force the next character (if ++ it is a letter) to upper or lower case, respectively, and then the + state automatically reverts to no case forcing. Case forcing applies to + all inserted characters, including those from captured groups and let- + ters within \Q...\E quoted sequences. + + Note that case forcing sequences such as \U...\E do not nest. For exam- +- ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final ++ ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final + \E has no effect. + +- The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +- flexibility to group substitution. The syntax is similar to that used ++ The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more ++ flexibility to group substitution. The syntax is similar to that used + by Bash: + + ${<n>:-<string>} + ${<n>:+<string1>:<string2>} + +- As before, <n> may be a group number or a name. The first form speci- +- fies a default value. 
If group <n> is set, its value is inserted; if +- not, <string> is expanded and the result inserted. The second form +- specifies strings that are expanded and inserted when group <n> is set +- or unset, respectively. The first form is just a convenient shorthand ++ As before, <n> may be a group number or a name. The first form speci- ++ fies a default value. If group <n> is set, its value is inserted; if ++ not, <string> is expanded and the result inserted. The second form ++ specifies strings that are expanded and inserted when group <n> is set ++ or unset, respectively. The first form is just a convenient shorthand + for + + ${<n>:+${<n>}:<string>} + +- Backslash can be used to escape colons and closing curly brackets in +- the replacement strings. A change of the case forcing state within a +- replacement string remains in force afterwards, as shown in this ++ Backslash can be used to escape colons and closing curly brackets in ++ the replacement strings. A change of the case forcing state within a ++ replacement string remains in force afterwards, as shown in this + pcre2test example: + + /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo +@@ -6614,8 +6617,9 @@ BACKSLASH + + Resetting the match start + +- The escape sequence \K causes any previously matched characters not to +- be included in the final matched sequence. For example, the pattern: ++ In normal use, the escape sequence \K causes any previously matched ++ characters not to be included in the final matched sequence that is ++ returned. For example, the pattern: + + foo\Kbar + +@@ -6634,7 +6638,16 @@ BACKSLASH + defined". In PCRE2, \K is acted upon when it occurs inside positive + assertions, but is ignored in negative assertions. Note that when a + pattern such as (?=ab\K) matches, the reported start of the match can +- be greater than the end of the match. ++ be greater than the end of the match. Using \K in a lookbehind asser- ++ tion at the start of a pattern can also lead to odd effects. 
For exam- ++ ple, consider this pattern: ++ ++ (?<=\Kfoo)bar ++ ++ If the subject is "foobar", a call to pcre2_match() with a starting ++ offset of 3 succeeds and reports the matching string as "foobar", that ++ is, the start of the reported match is earlier than where the match ++ started. + + Simple assertions + +diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 +index 786b314..57b6d31 100644 +--- a/doc/pcre2api.3 ++++ b/doc/pcre2api.3 +@@ -3122,7 +3122,10 @@ string in \fIoutputbuffer\fP, replacing the part that was matched with the + \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can + be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in + which a \eK item in a lookahead in the pattern causes the match to end before +-it starts are not supported, and give rise to an error return. ++it starts are not supported, and give rise to an error return. For global ++replacements, matches in which \eK in a lookbehind causes the match to start ++earlier than the point that was reached in the previous iteration are also not ++supported. + .P + The first seven arguments of \fBpcre2_substitute()\fP are the same as for + \fBpcre2_match()\fP, except that the partial matching options are not +diff --git a/src/pcre2.h.in b/src/pcre2.h.in +index a3a3fa6..0bc8cca 100644 +--- a/src/pcre2.h.in ++++ b/src/pcre2.h.in +@@ -5,7 +5,7 @@ + /* This is the public header file for the PCRE library, second API, to be + #included by applications that call PCRE2 functions. + +- Copyright (c) 2016-2017 University of Cambridge ++ Copyright (c) 2016-2018 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -387,6 +387,7 @@ released, the numbers must not be changed. 
*/ + #define PCRE2_ERROR_BADSERIALIZEDDATA (-62) + #define PCRE2_ERROR_HEAPLIMIT (-63) + #define PCRE2_ERROR_CONVERT_SYNTAX (-64) ++#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65) + + + /* Request types for pcre2_pattern_info() */ +diff --git a/src/pcre2_error.c b/src/pcre2_error.c +index d98cae9..dce1efb 100644 +--- a/src/pcre2_error.c ++++ b/src/pcre2_error.c +@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge +- New API code Copyright (c) 2016-2017 University of Cambridge ++ New API code Copyright (c) 2016-2018 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -260,6 +260,8 @@ static const unsigned char match_error_texts[] = + "bad serialized data\0" + "heap limit exceeded\0" + "invalid syntax\0" ++ /* 65 */ ++ "internal error - duplicate substitution match\0" + ; + + +diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c +index 8da951f..582a42d 100644 +--- a/src/pcre2_substitute.c ++++ b/src/pcre2_substitute.c +@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
+ + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge +- New API code Copyright (c) 2016 University of Cambridge ++ New API code Copyright (c) 2016-2018 University of Cambridge + + ----------------------------------------------------------------------------- + Redistribution and use in source and binary forms, with or without +@@ -238,10 +238,12 @@ PCRE2_SPTR repend; + PCRE2_SIZE extra_needed = 0; + PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength; + PCRE2_SIZE *ovector; ++PCRE2_SIZE ovecsave[3]; + + buff_offset = 0; + lengthleft = buff_length = *blength; + *blength = PCRE2_UNSET; ++ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; + + /* Partial matching is not valid. */ + +@@ -368,6 +370,26 @@ do + rc = PCRE2_ERROR_BADSUBSPATTERN; + goto EXIT; + } ++ ++ /* Check for the same match as previous. This is legitimate after matching an ++ empty string that starts after the initial match offset. We have tried again ++ at the match point in case the pattern is one like /(?<=\G.)/ which can never ++ match at its starting point, so running the match achieves the bumpalong. If ++ we do get the same (null) match at the original match point, it isn't such a ++ pattern, so we now do the empty string magic. In all other cases, a repeat ++ match should never occur. */ ++ ++ if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) ++ { ++ if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) ++ { ++ goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; ++ ovecsave[2] = start_offset; ++ continue; /* Back to the top of the loop */ ++ } ++ rc = PCRE2_ERROR_INTERNAL_DUPMATCH; ++ goto EXIT; ++ } + + /* Count substitutions with a paranoid check for integer overflow; surely no + real call to this function would ever hit this! */ +@@ -799,13 +821,18 @@ do + } /* End handling a literal code unit */ + } /* End of loop for scanning the replacement. */ + +- /* The replacement has been copied to the output. 
Update the start offset to +- point to the rest of the subject string. If we matched an empty string, +- do the magic for global matches. */ +- +- start_offset = ovector[1]; +- goptions = (ovector[0] != ovector[1])? 0 : ++ /* The replacement has been copied to the output. Save the details of this ++ match. See above for how this data is used. If we matched an empty string, do ++ the magic for global matches. Finally, update the start offset to point to ++ the rest of the subject string. */ ++ ++ ovecsave[0] = ovector[0]; ++ ovecsave[1] = ovector[1]; ++ ovecsave[2] = start_offset; ++ ++ goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : + PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; ++ start_offset = ovector[1]; + } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ + + /* Copy the rest of the subject. */ +diff --git a/src/pcre2test.c b/src/pcre2test.c +index ad3db2c..d83aa43 100644 +--- a/src/pcre2test.c ++++ b/src/pcre2test.c +@@ -6283,6 +6283,7 @@ size_t needlen; + void *use_dat_context; + BOOL utf; + BOOL subject_literal; ++PCRE2_SIZE ovecsave[3]; + + #ifdef SUPPORT_PCRE2_8 + uint8_t *q8 = NULL; +@@ -6929,6 +6930,9 @@ if (dat_datctl.replacement[0] != 0) + + if (timeitm) + fprintf(outfile, "** Timing is not supported with replace: ignored\n"); ++ ++ if ((dat_datctl.control & CTL_ALTGLOBAL) != 0) ++ fprintf(outfile, "** Altglobal is not supported with replace: ignored\n"); + + xoptions = (((dat_datctl.control & CTL_GLOBAL) == 0)? 0 : + PCRE2_SUBSTITUTE_GLOBAL) | +@@ -7048,35 +7052,24 @@ if (dat_datctl.replacement[0] != 0) + } + + fprintf(outfile, "\n"); ++ show_memory = FALSE; ++ return PR_OK; + } /* End of substitution handling */ + + /* When a replacement string is not provided, run a loop for global matching +-with one of the basic matching functions. */ ++with one of the basic matching functions. For altglobal (or first time round ++the loop), set an "unset" value for the previous match info. 
*/ ++ ++ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; + +-else for (gmatched = 0;; gmatched++) ++for (gmatched = 0;; gmatched++) + { + PCRE2_SIZE j; + int capcount; + PCRE2_SIZE *ovector; +- PCRE2_SIZE ovecsave[2]; + + ovector = FLD(match_data, ovector); + +- /* After the first time round a global loop, for a normal global (/g) +- iteration, save the current ovector[0,1] so that we can check that they do +- change each time. Otherwise a matching bug that returns the same string +- causes an infinite loop. It has happened! */ +- +- if (gmatched > 0 && (dat_datctl.control & CTL_GLOBAL) != 0) +- { +- ovecsave[0] = ovector[0]; +- ovecsave[1] = ovector[1]; +- } +- +- /* For altglobal (or first time round the loop), set an "unset" value. */ +- +- else ovecsave[0] = ovecsave[1] = PCRE2_UNSET; +- + /* Fill the ovector with junk to detect elements that do not get set + when they should be. */ + +@@ -7243,12 +7236,23 @@ else for (gmatched = 0;; gmatched++) + } + + /* If this is not the first time round a global loop, check that the +- returned string has changed. If not, there is a bug somewhere and we must +- break the loop because it will go on for ever. We know that there are +- always at least two elements in the ovector. */ +- ++ returned string has changed. If it has not, check for an empty string match ++ at different starting offset from the previous match. This is a failed test ++ retry for null-matching patterns that don't match at their starting offset, ++ for example /(?<=\G.)/. A repeated match at the same point is not such a ++ pattern, and must be discarded, and we then proceed to seek a non-null ++ match at the current point. For any other repeated match, there is a bug ++ somewhere and we must break the loop because it will go on for ever. We ++ know that there are always at least two elements in the ovector. 
*/ ++ + if (gmatched > 0 && ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) + { ++ if (ovector[0] == ovector[1] && ovecsave[2] != dat_datctl.offset) ++ { ++ g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; ++ ovecsave[2] = dat_datctl.offset; ++ continue; /* Back to the top of the loop */ ++ } + fprintf(outfile, + "** PCRE2 error: global repeat returned the same string as previous\n"); + fprintf(outfile, "** Global loop abandoned\n"); +@@ -7556,6 +7560,7 @@ else for (gmatched = 0;; gmatched++) + + if ((dat_datctl.control & CTL_ANYGLOB) == 0) break; else + { ++ PCRE2_SIZE match_offset = FLD(match_data, ovector)[0]; + PCRE2_SIZE end_offset = FLD(match_data, ovector)[1]; + + /* We must now set up for the next iteration of a global search. If we have +@@ -7563,12 +7568,19 @@ else for (gmatched = 0;; gmatched++) + subject. If so, the loop is over. Otherwise, mimic what Perl's /g option + does. Set PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED and try the match again + at the same point. If this fails it will be picked up above, where a fake +- match is set up so that at this point we advance to the next character. */ +- +- if (FLD(match_data, ovector)[0] == end_offset) ++ match is set up so that at this point we advance to the next character. ++ ++ However, in order to cope with patterns that never match at their starting ++ offset (e.g. /(?<=\G.)/) we don't do this when the match offset is greater ++ than the starting offset. This means there will be a retry with the ++ starting offset at the match offset. If this returns the same match again, ++ it is picked up above and ignored, and the special action is then taken. 
*/ ++ ++ if (match_offset == end_offset) + { +- if (end_offset == ulen) break; /* End of subject */ +- g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; ++ if (end_offset == ulen) break; /* End of subject */ ++ if (match_offset <= dat_datctl.offset) ++ g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + } + + /* However, even after matching a non-empty string, there is still one +@@ -7606,10 +7618,19 @@ else for (gmatched = 0;; gmatched++) + } + } + +- /* For /g (global), update the start offset, leaving the rest alone. */ ++ /* For a normal global (/g) iteration, save the current ovector[0,1] and ++ the starting offset so that we can check that they do change each time. ++ Otherwise a matching bug that returns the same string causes an infinite ++ loop. It has happened! Then update the start offset, leaving other ++ parameters alone. */ + + if ((dat_datctl.control & CTL_GLOBAL) != 0) ++ { ++ ovecsave[0] = ovector[0]; ++ ovecsave[1] = ovector[1]; ++ ovecsave[2] = dat_datctl.offset; + dat_datctl.offset = end_offset; ++ } + + /* For altglobal, just update the pointer and length. 
*/ + +diff --git a/testdata/testinput1 b/testdata/testinput1 +index 9a9c5fd..fb50238 100644 +--- a/testdata/testinput1 ++++ b/testdata/testinput1 +@@ -6189,4 +6189,7 @@ ef) x/x,mark + /(?=a+)a(a+)++b/ + aab + ++/(?<=\G.)/g,aftertext ++ abc ++ + # End of testinput1 +diff --git a/testdata/testinput2 b/testdata/testinput2 +index 5d3a80e..797b0f7 100644 +--- a/testdata/testinput2 ++++ b/testdata/testinput2 +@@ -4935,6 +4935,9 @@ a)"xI + //replace=0 + \=offset=7 + ++/(?<=\G.)/g,replace=+ ++ abc ++ + ".+\QX\E+"B,no_auto_possess + + ".+\QX\E+"B,auto_callout,no_auto_possess +diff --git a/testdata/testoutput1 b/testdata/testoutput1 +index 9c55be9..348dcbc 100644 +--- a/testdata/testoutput1 ++++ b/testdata/testoutput1 +@@ -9822,4 +9822,13 @@ No match + 0: aab + 1: a + ++/(?<=\G.)/g,aftertext ++ abc ++ 0: ++ 0+ bc ++ 0: ++ 0+ c ++ 0: ++ 0+ ++ + # End of testinput1 +diff --git a/testdata/testoutput2 b/testdata/testoutput2 +index fcaac8f..5c13f5b 100644 +--- a/testdata/testoutput2 ++++ b/testdata/testoutput2 +@@ -15545,6 +15545,10 @@ Failed: error -57 at offset 2 in replacement: bad escape sequence in replacement + \=offset=7 + Failed: error -33: bad offset value + ++/(?<=\G.)/g,replace=+ ++ abc ++ 3: a+b+c+ ++ + ".+\QX\E+"B,no_auto_possess + ------------------------------------------------------------------ + Bra +@@ -16576,7 +16580,7 @@ No match + ------------------------------------------------------------------ + + # End of testinput2 +-Error -65: PCRE2_ERROR_BADDATA (unknown error number) ++Error -70: PCRE2_ERROR_BADDATA (unknown error number) + Error -62: bad serialized data + Error -2: partial match + Error -1: no match +-- +2.14.4 + diff --git a/pcre2.spec b/pcre2.spec index 00080dc..0fe7224 100644 --- a/pcre2.spec +++ b/pcre2.spec @@ -9,7 +9,7 @@ #%%global rcversion RC1 Name: pcre2 Version: 10.31 -Release: %{?rcversion:0.}5%{?rcversion:.%rcversion}%{?dist} +Release: %{?rcversion:0.}6%{?rcversion:.%rcversion}%{?dist} %global myversion %{version}%{?rcversion:-%rcversion} 
Summary: Perl-compatible regular expression library # the library: BSD with exceptions @@ -71,6 +71,10 @@ Patch7: pcre2-10.31-Set-error-offset-zero-for-early-errors-in-pcre2_patt.pat # Fix bug when \K is used in a lookbehind in a substitute pattern, # in upstream after 10.31 Patch8: pcre2-10.31-Fix-bug-when-K-is-used-in-a-lookbehind-in-a-substitu.patch +# Fix global search/replace in pcre2test and pcre2_substitute() when the pattern +# matches an empty string, but never at the starting offset, +# in upstream after 10.31 +Patch9: pcre2-10.31-Fix-global-search-replace-in-pcre2test-and-pcre2_sub.patch BuildRequires: autoconf BuildRequires: automake BuildRequires: coreutils @@ -155,6 +159,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test. %patch6 -p1 %patch7 -p1 %patch8 -p1 +%patch9 -p1 # Because of multilib patch libtoolize --copy --force autoreconf -vif @@ -257,6 +262,10 @@ make %{?_smp_mflags} check VERBOSE=yes %{_mandir}/man1/pcre2test.* %changelog +* Mon Jul 02 2018 Petr Pisar - 10.31-6 +- Fix global search/replace in pcre2test and pcre2_substitute() when the pattern + matches an empty string, but never at the starting offset + * Mon Jun 25 2018 Petr Pisar - 10.31-5 - Fix bug when \K is used in a lookbehind in a substitute pattern