Fix multi-line matching in pcre2grep tool

This commit is contained in:
Petr Písař 2017-11-13 18:22:44 +01:00
parent f49a98f096
commit 49359ae6d0
3 changed files with 499 additions and 1 deletions

View File

@ -0,0 +1,383 @@
From eff8c9e117259b1192919b85c2ee03a27b164f1a Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Mon, 13 Nov 2017 16:52:39 +0000
Subject: [PATCH] Fix multiple multiline matching issues in pcre2grep.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@878 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
diff --git a/RunGrepTest b/RunGrepTest
index cf88c78..a7496cb 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -248,7 +248,7 @@ echo "---------------------------- Test 35 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 36 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude 'grepinput$' --exclude=grepinput8 --exclude=grepinputM --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 37 -----------------------------" >>testtrygrep
@@ -391,6 +391,12 @@ echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 70 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --color=always -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -M "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -M -n "triple:\t.*\n\n" ./testdata/grepinput3) >>testtrygrep
+echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 71 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o "^01|^02|^03" ./testdata/grepinput) >>testtrygrep
@@ -494,25 +500,25 @@ echo "---------------------------- Test 95 -----------------------------" >>test
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 96 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' 'fox' ./test* | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include-dir=testdata --exclude '^(?!grepinput)' --exclude=grepinputM 'fox' ./test* | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 97 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include=grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 98 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >>testtemp1grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --exclude=grepinput3 --exclude=grepinputM --include=grepinput --exclude-from $builddir/testtemp1grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 99 -----------------------------" >>testtrygrep
echo "grepinput$" >testtemp1grep
echo "grepinput8" >testtemp2grep
-(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -L -r --include grepinput --exclude=grepinputM --exclude-from $builddir/testtemp1grep --exclude-from=$builddir/testtemp2grep --exclude-dir='^\.' 'fox' ./testdata | sort) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 100 ------------------------------" >>testtrygrep
@@ -582,7 +588,7 @@ echo "---------------------------- Test 115 -----------------------------" >>tes
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 116 -----------------------------" >>testtrygrep
-(cd $srcdir; $valgrind $vjs $pcre2grep -th 'the' testdata/grepinput*) >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep --exclude=grepinputM -th 'the' testdata/grepinput*) >>testtrygrep
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 117 -----------------------------" >>testtrygrep
@@ -610,10 +616,20 @@ echo "---------------------------- Test 122 -----------------------------" >>tes
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'cat|dog' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep
-echo "---------------------------- Test 122 -----------------------------" >>testtrygrep
+echo "---------------------------- Test 123 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -w 'dog|cat' testdata/grepinputv) >>testtrygrep
echo "RC=$?" >>testtrygrep
+echo "---------------------------- Test 124 -----------------------------" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn --colour=always -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+(cd $srcdir; $valgrind $vjs $pcre2grep -Mn -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
# Now compare the results.
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index 1649d5a..bec07e1 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -2505,7 +2505,10 @@ while (ptr < endptr)
match = match_patterns(ptr, length, options, startoffset, &mrc);
options = PCRE2_NOTEMPTY;
- /* If it's a match or a not-match (as required), do what's wanted. */
+ /* If it's a match or a not-match (as required), do what's wanted. NOTE: Use
+ only FWRITE_IGNORE() - which is just a packaged fwrite() that ignores its
+ return code - to output data lines, so that binary zeroes are treated as just
+ another data character. */
if (match != invert)
{
@@ -2734,27 +2737,6 @@ while (ptr < endptr)
if (printname != NULL) fprintf(stdout, "%s:", printname);
if (number) fprintf(stdout, "%d:", linenumber);
- /* In multiline mode, we want to print to the end of the line in which
- the end of the matched string is found, so we adjust linelength and the
- line number appropriately, but only when there actually was a match
- (invert not set). Because the PCRE2_FIRSTLINE option is set, the start of
- the match will always be before the first newline sequence. */
-
- if (multiline & !invert)
- {
- char *endmatch = ptr + offsets[1];
- t = ptr;
- while (t <= endmatch)
- {
- t = end_of_line(t, endptr, &endlinelength);
- if (t < endmatch) linenumber++; else break;
- }
- linelength = t - ptr - endlinelength;
- }
-
- /*** NOTE: Use only fwrite() to output the data line, so that binary
- zeroes are treated as just another data character. */
-
/* This extra option, for Jeffrey Friedl's debugging requirements,
replaces the matched string, or a specific captured string if it exists,
with X. When this happens, colouring is ignored. */
@@ -2771,20 +2753,48 @@ while (ptr < endptr)
else
#endif
- /* We have to split the line(s) up if colouring, and search for further
- matches, but not of course if the line is a non-match. */
+ /* In multiline mode, or if colouring, we have to split the line(s) up
+ and search for further matches, but not of course if the line is a
+ non-match. In multiline mode this is necessary in case there is another
+ match that spans the end of the current line. When colouring we want to
+ colour all matches. */
- if (do_colour && !invert)
+ if ((multiline || do_colour) && !invert)
{
int plength;
FWRITE_IGNORE(ptr, 1, offsets[0], stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
for (;;)
{
- startoffset = offsets[1];
- if (startoffset >= linelength + endlinelength ||
- !match_patterns(ptr, length, options, startoffset, &mrc))
- break;
+ startoffset = offsets[1]; /* Advance after previous match. */
+
+ /* If the current match ended past the end of the line (only possible
+ in multiline mode), we must move on to the line in which it did end
+ before searching for more matches. Because the PCRE2_FIRSTLINE option
+ is set, the start of the match will always be before the first
+ newline sequence. */
+
+ while (startoffset > linelength + endlinelength)
+ {
+ ptr += linelength + endlinelength;
+ filepos += (int)(linelength + endlinelength);
+ linenumber++;
+ startoffset -= (int)(linelength + endlinelength);
+ t = end_of_line(ptr, endptr, &endlinelength);
+ linelength = t - ptr - endlinelength;
+ length = (size_t)(endptr - ptr);
+ }
+
+ /* If startoffset is at the exact end of the line it means this
+ complete line was the final part of the match, so there is nothing
+ more to do. */
+
+ if (startoffset == linelength + endlinelength) break;
+
+ /* Otherwise, run a match from within the final line, and if found,
+ loop for any that may follow. */
+
+ if (!match_patterns(ptr, length, options, startoffset, &mrc)) break;
FWRITE_IGNORE(ptr + startoffset, 1, offsets[0] - startoffset, stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
}
@@ -2797,7 +2807,7 @@ while (ptr < endptr)
if (plength > 0) FWRITE_IGNORE(ptr + startoffset, 1, plength, stdout);
}
- /* Not colouring; no need to search for further matches */
+ /* Not colouring or multiline; no need to search for further matches. */
else FWRITE_IGNORE(ptr, 1, linelength + endlinelength, stdout);
}
diff --git a/testdata/grepinputM b/testdata/grepinputM
new file mode 100644
index 0000000..9119e3d
--- /dev/null
+++ b/testdata/grepinputM
@@ -0,0 +1,17 @@
+Data file for multiline tests of multiple matches.
+
+start end in between start
+end and following
+Other stuff
+
+start end in between start
+end and following start
+end other stuff
+
+start end in between start
+
+end
+
+** These two lines must be last.
+start end in between start
+end
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 52e0d17..7e963fb 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
@@ -487,6 +487,7 @@ RC=0
./testdata/grepinput:456
./testdata/grepinput3:0
./testdata/grepinput8:0
+./testdata/grepinputM:0
./testdata/grepinputv:1
./testdata/grepinputx:0
RC=0
@@ -600,6 +601,33 @@ RC=0
triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
RC=0
+1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
+triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
+1:triple: t1_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+6:triple: t3_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+8:triple: t4_txt s1_tag s_txt p_tag p_txt o_tag o_txt
+
+13:triple: t6_txt s2_tag s_txt p_tag p_txt o_tag o_txt
+
+RC=0
---------------------------- Test 71 -----------------------------
01
RC=0
@@ -793,21 +821,23 @@ RC=0
37216,12
RC=0
---------------------------- Test 113 -----------------------------
-478
+480
RC=0
---------------------------- Test 114 -----------------------------
testdata/grepinput:469
testdata/grepinput3:0
testdata/grepinput8:0
+testdata/grepinputM:2
testdata/grepinputv:3
testdata/grepinputx:6
-TOTAL:478
+TOTAL:480
RC=0
---------------------------- Test 115 -----------------------------
testdata/grepinput:469
+testdata/grepinputM:2
testdata/grepinputv:3
testdata/grepinputx:6
-TOTAL:478
+TOTAL:480
RC=0
---------------------------- Test 116 -----------------------------
478
@@ -816,9 +846,10 @@ RC=0
469
0
0
+2
3
6
-478
+480
RC=0
---------------------------- Test 118 -----------------------------
testdata/grepinput3
@@ -846,7 +877,62 @@ RC=0
over the lazy dog.
The word is cat in this line
RC=0
----------------------------- Test 122 -----------------------------
+---------------------------- Test 123 -----------------------------
over the lazy dog.
The word is cat in this line
RC=0
+---------------------------- Test 124 -----------------------------
+3:start end in between start
+end and following
+7:start end in between start
+end and following start
+end other stuff
+11:start end in between start
+
+end
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+5-Other stuff
+6-
+7:start end in between start
+end and following start
+end other stuff
+10-
+11:start end in between start
+
+end
+14-
+15-** These two lines must be last.
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+7:start end in between start
+end and following start
+end other stuff
+11:start end in between start
+
+end
+16:start end in between start
+end
+RC=0
+3:start end in between start
+end and following
+5-Other stuff
+6-
+7:start end in between start
+end and following start
+end other stuff
+10-
+11:start end in between start
+
+end
+14-
+15-** These two lines must be last.
+16:start end in between start
+end
+RC=0
--
2.13.6

View File

@ -0,0 +1,105 @@
From 5e964db12e04a84c9b74751d5c22ae4ff4bb416c Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Thu, 9 Nov 2017 17:50:59 +0000
Subject: [PATCH] Remove superfluous variable.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@877 6239d852-aaf2-0410-a92c-79f79f948069
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
src/pcre2grep.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index d75917c..1649d5a 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -2365,7 +2365,6 @@ while (ptr < endptr)
int mrc = 0;
unsigned int options = 0;
BOOL match;
- char *matchptr = ptr;
char *t = ptr;
size_t length, linelength;
size_t startoffset = 0;
@@ -2503,7 +2502,7 @@ while (ptr < endptr)
match, set PCRE2_NOTEMPTY to disable any further matches of null strings in
this line. */
- match = match_patterns(matchptr, length, options, startoffset, &mrc);
+ match = match_patterns(ptr, length, options, startoffset, &mrc);
options = PCRE2_NOTEMPTY;
/* If it's a match or a not-match (as required), do what's wanted. */
@@ -2564,14 +2563,14 @@ while (ptr < endptr)
/* Handle --line-offsets */
if (line_offsets)
- fprintf(stdout, "%d,%d" STDOUT_NL, (int)(matchptr + offsets[0] - ptr),
+ fprintf(stdout, "%d,%d" STDOUT_NL, (int)(ptr + offsets[0] - ptr),
(int)(offsets[1] - offsets[0]));
/* Handle --file-offsets */
else if (file_offsets)
fprintf(stdout, "%d,%d" STDOUT_NL,
- (int)(filepos + matchptr + offsets[0] - ptr),
+ (int)(filepos + ptr + offsets[0] - ptr),
(int)(offsets[1] - offsets[0]));
/* Handle --output (which has already been syntax checked) */
@@ -2579,7 +2578,7 @@ while (ptr < endptr)
else if (output_text != NULL)
{
if (display_output_text((PCRE2_SPTR)output_text, FALSE,
- (PCRE2_SPTR)matchptr, offsets, mrc) || printname != NULL ||
+ (PCRE2_SPTR)ptr, offsets, mrc) || printname != NULL ||
number)
fprintf(stdout, STDOUT_NL);
}
@@ -2601,7 +2600,7 @@ while (ptr < endptr)
{
if (printed && om_separator != NULL)
fprintf(stdout, "%s", om_separator);
- print_match(matchptr + offsets[n*2], plen);
+ print_match(ptr + offsets[n*2], plen);
printed = TRUE;
}
}
@@ -2628,7 +2627,7 @@ while (ptr < endptr)
{
if (startoffset >= length) goto END_ONE_MATCH; /* Were at end */
startoffset = oldstartoffset + 1;
- if (utf) while ((matchptr[startoffset] & 0xc0) == 0x80) startoffset++;
+ if (utf) while ((ptr[startoffset] & 0xc0) == 0x80) startoffset++;
}
/* If the current match ended past the end of the line (only possible
@@ -2637,7 +2636,7 @@ while (ptr < endptr)
while (startoffset > linelength)
{
- matchptr = ptr += linelength + endlinelength;
+ ptr += linelength + endlinelength;
filepos += (int)(linelength + endlinelength);
linenumber++;
startoffset -= (int)(linelength + endlinelength);
@@ -2784,10 +2783,10 @@ while (ptr < endptr)
{
startoffset = offsets[1];
if (startoffset >= linelength + endlinelength ||
- !match_patterns(matchptr, length, options, startoffset, &mrc))
+ !match_patterns(ptr, length, options, startoffset, &mrc))
break;
- FWRITE_IGNORE(matchptr + startoffset, 1, offsets[0] - startoffset, stdout);
- print_match(matchptr + offsets[0], offsets[1] - offsets[0]);
+ FWRITE_IGNORE(ptr + startoffset, 1, offsets[0] - startoffset, stdout);
+ print_match(ptr + offsets[0], offsets[1] - offsets[0]);
}
/* In multiline mode, we may have already printed the complete line
--
2.13.6

View File

@ -6,7 +6,7 @@
#%%global rcversion RC1 #%%global rcversion RC1
Name: pcre2 Name: pcre2
Version: 10.30 Version: 10.30
Release: %{?rcversion:0.}2%{?rcversion:.%rcversion}%{?dist} Release: %{?rcversion:0.}3%{?rcversion:.%rcversion}%{?dist}
%global myversion %{version}%{?rcversion:-%rcversion} %global myversion %{version}%{?rcversion:-%rcversion}
Summary: Perl-compatible regular expression library Summary: Perl-compatible regular expression library
# the library: BSD with exceptions # the library: BSD with exceptions
@ -48,6 +48,11 @@ Patch1: pcre2-10.30-Fix-pcre2grep-recursive-file-name-length-issue.patch
# 2/2 Accept files names longer than 128 bytes in recursive mode of pcre2grep, # 2/2 Accept files names longer than 128 bytes in recursive mode of pcre2grep,
# upstream bug #2177, in upstream after 10.30 # upstream bug #2177, in upstream after 10.30
Patch2: pcre2-10.30-Fix-memory-leak-issue-introduced-in-last-bug-fix-in-.patch Patch2: pcre2-10.30-Fix-memory-leak-issue-introduced-in-last-bug-fix-in-.patch
# Required by Patch4 (Fix-multiple-multiline-matching-issues-in-pcre2grep.patch), which applies on top of it
Patch3: pcre2-10.30-Remove-superflous-variable.patch
# Fix multi-line matching in pcre2grep tool, upstream bug #2187,
# in upstream after 10.30
Patch4: pcre2-10.30-Fix-multiple-multiline-matching-issues-in-pcre2grep.patch
BuildRequires: autoconf BuildRequires: autoconf
BuildRequires: automake BuildRequires: automake
BuildRequires: coreutils BuildRequires: coreutils
@ -124,6 +129,8 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
%patch0 -p1 %patch0 -p1
%patch1 -p1 %patch1 -p1
%patch2 -p1 %patch2 -p1
%patch3 -p1
%patch4 -p1
# Because of multilib patch # Because of multilib patch
libtoolize --copy --force libtoolize --copy --force
autoreconf -vif autoreconf -vif
@ -227,6 +234,9 @@ make %{?_smp_mflags} check VERBOSE=yes
%{_mandir}/man1/pcre2test.* %{_mandir}/man1/pcre2test.*
%changelog %changelog
* Mon Nov 13 2017 Petr Pisar <ppisar@redhat.com> - 10.30-3
- Fix multi-line matching in pcre2grep tool (upstream bug #2187)
* Thu Nov 02 2017 Petr Pisar <ppisar@redhat.com> - 10.30-2 * Thu Nov 02 2017 Petr Pisar <ppisar@redhat.com> - 10.30-2
- Accept files names longer than 128 bytes in recursive mode of pcre2grep - Accept files names longer than 128 bytes in recursive mode of pcre2grep
(upstream bug #2177) (upstream bug #2177)