From eff8c9e117259b1192919b85c2ee03a27b164f1a Mon Sep 17 00:00:00 2001 From: ph10 Date: Mon, 13 Nov 2017 16:52:39 +0000 Subject: [PATCH] Fix multiple multiline matching issues in pcre2grep. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@878 6239d852-aaf2-0410-a92c-79f79f948069 Petr Písař: Ported to 10.30. diff --git a/RunGrepTest b/RunGrepTest index cf88c78..a7496cb 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -248,7 +248,7 @@ echo "---------------------------- Test 35 -----------------------------" >>test echo "RC=$?" >>testtrygrep echo "---------------------------- Test 36 -----------------------------" >>testtrygrep -(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude=grepinputM --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 37 -----------------------------" >>testtrygrep @@ -391,6 +391,12 @@ echo "RC=$?" >>testtrygrep echo "---------------------------- Test 70 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep +echo "RC=$?" >>testtrygrep echo "---------------------------- Test 71 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|^02|^03" ./testdata/grepinput) >>testtrygrep @@ -494,25 +500,25 @@ echo "---------------------------- Test 95 -----------------------------" >>test echo "RC=$?" >>testtrygrep echo "---------------------------- Test 96 -----------------------------" >>testtrygrep -(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' 'fox' ./test* | sort) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinputM 'fox' ./test* | sort) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 97 -----------------------------" >>testtrygrep echo "grepinput$" >testtemp1grep echo "grepinput8" >>testtemp1grep -(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 98 -----------------------------" >>testtrygrep echo "grepinput$" >testtemp1grep echo "grepinput8" >>testtemp1grep -(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinputM --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 99 -----------------------------" >>testtrygrep echo "grepinput$" >testtemp1grep echo "grepinput8" >testtemp2grep -(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 100 ------------------------------" >>testtrygrep @@ -582,7 +588,7 @@ echo "---------------------------- Test 115 -----------------------------" >>tes echo "RC=$?" >>testtrygrep echo "---------------------------- Test 116 -----------------------------" >>testtrygrep -(cd $srcdir; $valgrind $vjs $pcre2grep -th 'the' testdata/grepinput*) >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinputM -th 'the' testdata/grepinput*) >>testtrygrep echo "RC=$?" >>testtrygrep echo "---------------------------- Test 117 -----------------------------" >>testtrygrep @@ -610,10 +616,20 @@ echo "---------------------------- Test 122 -----------------------------" >>tes (cd $srcdir; $valgrind $vjs $pcre2grep -w 'cat|dog' testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep -echo "---------------------------- Test 122 -----------------------------" >>testtrygrep +echo "---------------------------- Test 123 -----------------------------" >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -w 'dog|cat' testdata/grepinputv) >>testtrygrep echo "RC=$?" >>testtrygrep +echo "---------------------------- Test 124 -----------------------------" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always 'start[\s]+end' testdata/grepinputM) >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -Mn 'start[\s]+end' testdata/grepinputM) >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -Mn -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep +echo "RC=$?" >>testtrygrep + # Now compare the results. diff --git a/src/pcre2grep.c b/src/pcre2grep.c index 1649d5a..bec07e1 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -2505,7 +2505,10 @@ while (ptr < endptr) match = match_patterns(ptr, length, options, startoffset, &mrc); options = PCRE2_NOTEMPTY; - /* If it's a match or a not-match (as required), do what's wanted. */ + /* If it's a match or a not-match (as required), do what's wanted. NOTE: Use + only FWRITE_IGNORE() - which is just a packaged fwrite() that ignores its + return code - to output data lines, so that binary zeroes are treated as just + another data character. */ if (match != invert) { @@ -2734,27 +2737,6 @@ while (ptr < endptr) if (printname != NULL) fprintf(stdout, "%s:", printname); if (number) fprintf(stdout, "%d:", linenumber); - /* In multiline mode, we want to print to the end of the line in which - the end of the matched string is found, so we adjust linelength and the - line number appropriately, but only when there actually was a match - (invert not set). Because the PCRE2_FIRSTLINE option is set, the start of - the match will always be before the first newline sequence. */ - - if (multiline & !invert) - { - char *endmatch = ptr + offsets[1]; - t = ptr; - while (t <= endmatch) - { - t = end_of_line(t, endptr, &endlinelength); - if (t < endmatch) linenumber++; else break; - } - linelength = t - ptr - endlinelength; - } - - /*** NOTE: Use only fwrite() to output the data line, so that binary - zeroes are treated as just another data character. */ - /* This extra option, for Jeffrey Friedl's debugging requirements, replaces the matched string, or a specific captured string if it exists, with X. When this happens, colouring is ignored. */ @@ -2771,20 +2753,48 @@ while (ptr < endptr) else #endif - /* We have to split the line(s) up if colouring, and search for further - matches, but not of course if the line is a non-match. */ + /* In multiline mode, or if colouring, we have to split the line(s) up + and search for further matches, but not of course if the line is a + non-match. In multiline mode this is necessary in case there is another + match that spans the end of the current line. When colouring we want to + colour all matches. */ - if (do_colour && !invert) + if ((multiline || do_colour) && !invert) { int plength; FWRITE_IGNORE(ptr, 1, offsets[0], stdout); print_match(ptr + offsets[0], offsets[1] - offsets[0]); for (;;) { - startoffset = offsets[1]; - if (startoffset >= linelength + endlinelength || - !match_patterns(ptr, length, options, startoffset, &mrc)) - break; + startoffset = offsets[1]; /* Advance after previous match. */ + + /* If the current match ended past the end of the line (only possible + in multiline mode), we must move on to the line in which it did end + before searching for more matches. Because the PCRE2_FIRSTLINE option + is set, the start of the match will always be before the first + newline sequence. */ + + while (startoffset > linelength + endlinelength) + { + ptr += linelength + endlinelength; + filepos += (int)(linelength + endlinelength); + linenumber++; + startoffset -= (int)(linelength + endlinelength); + t = end_of_line(ptr, endptr, &endlinelength); + linelength = t - ptr - endlinelength; + length = (size_t)(endptr - ptr); + } + + /* If startoffset is at the exact end of the line it means this + complete line was the final part of the match, so there is nothing + more to do. */ + + if (startoffset == linelength + endlinelength) break; + + /* Otherwise, run a match from within the final line, and if found, + loop for any that may follow. */ + + if (!match_patterns(ptr, length, options, startoffset, &mrc)) break; FWRITE_IGNORE(ptr + startoffset, 1, offsets[0] - startoffset, stdout); print_match(ptr + offsets[0], offsets[1] - offsets[0]); } @@ -2797,7 +2807,7 @@ while (ptr < endptr) if (plength > 0) FWRITE_IGNORE(ptr + startoffset, 1, plength, stdout); } - /* Not colouring; no need to search for further matches */ + /* Not colouring or multiline; no need to search for further matches. */ else FWRITE_IGNORE(ptr, 1, linelength + endlinelength, stdout); } diff --git a/testdata/grepinputM b/testdata/grepinputM new file mode 100644 index 0000000..9119e3d --- /dev/null +++ b/testdata/grepinputM @@ -0,0 +1,17 @@ +Data file for multiline tests of multiple matches. + +start end in between start +end and following +Other stuff + +start end in between start +end and following start +end other stuff + +start end in between start + +end + +** These two lines must be last. +start end in between start +end diff --git a/testdata/grepoutput b/testdata/grepoutput index 52e0d17..7e963fb 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -487,6 +487,7 @@ RC=0 ./testdata/grepinput:456 ./testdata/grepinput3:0 ./testdata/grepinput8:0 +./testdata/grepinputM:0 ./testdata/grepinputv:1 ./testdata/grepinputx:0 RC=0 @@ -600,6 +601,33 @@ RC=0 triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt RC=0 +1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +RC=0 +triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +RC=0 +1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt + +13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt + +RC=0 ---------------------------- Test 71 ----------------------------- 01 RC=0 @@ -793,21 +821,23 @@ RC=0 37216,12 RC=0 ---------------------------- Test 113 ----------------------------- -478 +480 RC=0 ---------------------------- Test 114 ----------------------------- testdata/grepinput:469 testdata/grepinput3:0 testdata/grepinput8:0 +testdata/grepinputM:2 testdata/grepinputv:3 testdata/grepinputx:6 -TOTAL:478 +TOTAL:480 RC=0 ---------------------------- Test 115 ----------------------------- testdata/grepinput:469 +testdata/grepinputM:2 testdata/grepinputv:3 testdata/grepinputx:6 -TOTAL:478 +TOTAL:480 RC=0 ---------------------------- Test 116 ----------------------------- 478 @@ -816,9 +846,10 @@ RC=0 469 0 0 +2 3 6 -478 +480 RC=0 ---------------------------- Test 118 ----------------------------- testdata/grepinput3 @@ -846,7 +877,62 @@ RC=0 over the lazy dog. The word is cat in this line RC=0 ----------------------------- Test 122 ----------------------------- +---------------------------- Test 123 ----------------------------- over the lazy dog. The word is cat in this line RC=0 +---------------------------- Test 124 ----------------------------- +3:start end in between start +end and following +7:start end in between start +end and following start +end other stuff +11:start end in between start + +end +16:start end in between start +end +RC=0 +3:start end in between start +end and following +5-Other stuff +6- +7:start end in between start +end and following start +end other stuff +10- +11:start end in between start + +end +14- +15-** These two lines must be last. +16:start end in between start +end +RC=0 +3:start end in between start +end and following +7:start end in between start +end and following start +end other stuff +11:start end in between start + +end +16:start end in between start +end +RC=0 +3:start end in between start +end and following +5-Other stuff +6- +7:start end in between start +end and following start +end other stuff +10- +11:start end in between start + +end +14- +15-** These two lines must be last. +16:start end in between start +end +RC=0 -- 2.13.6