pcre2/pcre2-10.30-Fix-multiple-multiline-matching-issues-in-pcre2grep.patch
2017-11-13 18:22:44 +01:00

384 lines
14 KiB
Diff
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

From eff8c9e117259b1192919b85c2ee03a27b164f1a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 13 Nov 2017 16:52:39 +0000
Subject: [PATCH] Fix multiple multiline matching issues in pcre2grep.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@878 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
diff --git a/RunGrepTest b/RunGrepTest
index cf88c78..a7496cb 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -248,7 +248,7 @@ echo "---------------------------- Test 35 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 36 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude=grepinputM --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 37 -----------------------------" >>testtrygrep
@@ -391,6 +391,12 @@ echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 70 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 71 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|^02|^03" ./testdata/grepinput) >>testtrygrep
@@ -494,25 +500,25 @@ echo "---------------------------- Test 95 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 96 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' 'fox' ./test* | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinputM 'fox' ./test* | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 97 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 98 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinputM --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 99 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >testtemp2grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 100 ------------------------------" >>testtrygrep
@@ -582,7 +588,7 @@ echo "---------------------------- Test 115 -----------------------------" >>tes
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 116 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -th 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinputM -th 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 117 -----------------------------" >>testtrygrep
@@ -610,10 +616,20 @@ echo "---------------------------- Test 122 -----------------------------" >>tes
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'cat|dog' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep
-echo "---------------------------- Test 122 -----------------------------" >>testtrygrep
+echo "---------------------------- Test 123 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'dog|cat' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep
+echo "---------------------------- Test 124 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
# Now compare the results.
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index 1649d5a..bec07e1 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -2505,7 +2505,10 @@ while (ptr < endptr)
match = match_patterns(ptr, length, options, startoffset, &mrc);
options = PCRE2_NOTEMPTY;
- /* If it's a match or a not-match (as required), do what's wanted. */
+ /* If it's a match or a not-match (as required), do what's wanted. NOTE: Use
+ only FWRITE_IGNORE() - which is just a packaged fwrite() that ignores its
+ return code - to output data lines, so that binary zeroes are treated as just
+ another data character. */
if (match != invert)
{
@@ -2734,27 +2737,6 @@ while (ptr < endptr)
if (printname != NULL) fprintf(stdout, "%s:", printname);
if (number) fprintf(stdout, "%d:", linenumber);
- /* In multiline mode, we want to print to the end of the line in which
- the end of the matched string is found, so we adjust linelength and the
- line number appropriately, but only when there actually was a match
- (invert not set). Because the PCRE2_FIRSTLINE option is set, the start of
- the match will always be before the first newline sequence. */
-
- if (multiline & !invert)
- {
- char *endmatch = ptr + offsets[1];
- t = ptr;
- while (t <= endmatch)
- {
- t = end_of_line(t, endptr, &endlinelength);
- if (t < endmatch) linenumber++; else break;
- }
- linelength = t - ptr - endlinelength;
- }
-
- /*** NOTE: Use only fwrite() to output the data line, so that binary
- zeroes are treated as just another data character. */
-
/* This extra option, for Jeffrey Friedl's debugging requirements,
replaces the matched string, or a specific captured string if it exists,
with X. When this happens, colouring is ignored. */
@@ -2771,20 +2753,48 @@ while (ptr < endptr)
else
#endif
- /* We have to split the line(s) up if colouring, and search for further
- matches, but not of course if the line is a non-match. */
+ /* In multiline mode, or if colouring, we have to split the line(s) up
+ and search for further matches, but not of course if the line is a
+ non-match. In multiline mode this is necessary in case there is another
+ match that spans the end of the current line. When colouring we want to
+ colour all matches. */
- if (do_colour && !invert)
+ if ((multiline || do_colour) && !invert)
{
int plength;
FWRITE_IGNORE(ptr, 1, offsets[0], stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
for (;;)
{
- startoffset = offsets[1];
- if (startoffset >= linelength + endlinelength ||
- !match_patterns(ptr, length, options, startoffset, &mrc))
- break;
+ startoffset = offsets[1]; /* Advance after previous match. */
+
+ /* If the current match ended past the end of the line (only possible
+ in multiline mode), we must move on to the line in which it did end
+ before searching for more matches. Because the PCRE2_FIRSTLINE option
+ is set, the start of the match will always be before the first
+ newline sequence. */
+
+ while (startoffset > linelength + endlinelength)
+ {
+ ptr += linelength + endlinelength;
+ filepos += (int)(linelength + endlinelength);
+ linenumber++;
+ startoffset -= (int)(linelength + endlinelength);
+ t = end_of_line(ptr, endptr, &endlinelength);
+ linelength = t - ptr - endlinelength;
+ length = (size_t)(endptr - ptr);
+ }
+
+ /* If startoffset is at the exact end of the line it means this
+ complete line was the final part of the match, so there is nothing
+ more to do. */
+
+ if (startoffset == linelength + endlinelength) break;
+
+ /* Otherwise, run a match from within the final line, and if found,
+ loop for any that may follow. */
+
+ if (!match_patterns(ptr, length, options, startoffset, &mrc)) break;
FWRITE_IGNORE(ptr + startoffset, 1, offsets[0] - startoffset, stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
}
@@ -2797,7 +2807,7 @@ while (ptr < endptr)
if (plength > 0) FWRITE_IGNORE(ptr + startoffset, 1, plength, stdout);
}
- /* Not colouring; no need to search for further matches */
+ /* Not colouring or multiline; no need to search for further matches. */
else FWRITE_IGNORE(ptr, 1, linelength + endlinelength, stdout);
}
diff --git a/testdata/grepinputM b/testdata/grepinputM
new file mode 100644
index 0000000..9119e3d
--- /dev/null
+++ b/testdata/grepinputM
@@ -0,0 +1,17 @@
+Data file for multiline tests of multiple matches.
+
+start end in between start
+end and following
+Other stuff
+
+start end in between start
+end and following start
+end other stuff
+
+start end in between start
+
+end
+
+** These two lines must be last.
+start end in between start
+end
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 52e0d17..7e963fb 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
@@ -487,6 +487,7 @@ RC=0
./testdata/grepinput:456
./testdata/grepinput3:0
./testdata/grepinput8:0
+./testdata/grepinputM:0
./testdata/grepinputv:1
./testdata/grepinputx:0
RC=0
@@ -600,6 +601,33 @@ RC=0
triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
RC=0
+1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
+triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
+1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
---------------------------- Test 71 -----------------------------
01
RC=0
@@ -793,21 +821,23 @@ RC=0
37216,12
RC=0
---------------------------- Test 113 -----------------------------
-478
+480
RC=0
---------------------------- Test 114 -----------------------------
testdata/grepinput:469
testdata/grepinput3:0
testdata/grepinput8:0
+testdata/grepinputM:2
testdata/grepinputv:3
testdata/grepinputx:6
-TOTAL:478
+TOTAL:480
RC=0
---------------------------- Test 115 -----------------------------
testdata/grepinput:469
+testdata/grepinputM:2
testdata/grepinputv:3
testdata/grepinputx:6
-TOTAL:478
+TOTAL:480
RC=0
---------------------------- Test 116 -----------------------------
478
@@ -816,9 +846,10 @@ RC=0
469
0
0
+2
3
6
-478
+480
RC=0
---------------------------- Test 118 -----------------------------
testdata/grepinput3
@@ -846,7 +877,62 @@ RC=0
over the lazy dog.
The word is cat in this line
RC=0
----------------------------- Test 122 -----------------------------
+---------------------------- Test 123 -----------------------------
over the lazy dog.
The word is cat in this line
RC=0
+---------------------------- Test 124 -----------------------------
+3:start end in between start
+end and following
+7:start end in between start
+end and following start
+end other stuff
+11:start end in between start
+
+end
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+5-Other stuff
+6-
+7:start end in between start
+end and following start
+end other stuff
+10-
+11:start end in between start
+
+end
+14-
+15-** These two lines must be last.
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+7:start end in between start
+end and following start
+end other stuff
+11:start end in between start
+
+end
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+5-Other stuff
+6-
+7:start end in between start
+end and following start
+end other stuff
+10-
+11:start end in between start
+
+end
+14-
+15-** These two lines must be last.
+16:start end in between start
+end
+RC=0
--
2.13.6