Fix handling \K in an assertion in pcre2grep tool and documentation

This commit is contained in:
Petr Písař 2018-01-12 12:21:50 +01:00
parent f4e05051d1
commit 6d626f9a4d
4 changed files with 436 additions and 1 deletions

View File

@ -0,0 +1,174 @@
From c26d49eda45dd8a26e1de65a4430e84116266227 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sun, 24 Dec 2017 10:27:13 +0000
Subject: [PATCH 2/3] Documentation update.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@895 6239d852-aaf2-0410-a92c-79f79f948069
Signed-off-by: Petr Písař <ppisar@redhat.com>
---
doc/html/pcre2demo.html | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
doc/pcre2demo.3 | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 106 insertions(+)
diff --git a/doc/html/pcre2demo.html b/doc/html/pcre2demo.html
index d64e16b..72754d3 100644
--- a/doc/html/pcre2demo.html
+++ b/doc/html/pcre2demo.html
@@ -228,6 +228,21 @@ pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
+to set the start of a match later than its end. In this demonstration program,
+we just detect this case and give up. */
+
+if (ovector[0] &gt; ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
@@ -355,6 +370,29 @@ for (;;)
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
+ /* If the previous match was not an empty string, there is one tricky case to
+ consider. If a pattern contains \K within a lookbehind assertion at the
+ start, the end of the matched string can be at the offset where the match
+ started. Without special action, this leads to a loop that keeps on matching
+ the same substring. We must detect this case and arrange to move the start on
+ by one character. The pcre2_get_startchar() function returns the starting
+ offset that was passed to pcre2_match(). */
+
+ else
+ {
+ PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+ if (start_offset &lt;= startchar)
+ {
+ if (startchar &gt;= subject_length) break; /* Reached end of subject. */
+ start_offset = startchar + 1; /* Advance by one character. */
+ if (utf8) /* If UTF-8, it may be more */
+ { /* than one code unit. */
+ for (; start_offset &lt; subject_length; start_offset++)
+ if ((subject[start_offset] &amp; 0xc0) != 0x80) break;
+ }
+ }
+ }
+
/* Run the next matching operation */
rc = pcre2_match(
@@ -419,6 +457,21 @@ for (;;)
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+ /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+ assertion to set the start of a match later than its end. In this
+ demonstration program, we just detect this case and give up. */
+
+ if (ovector[0] &gt; ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3
index c02dcd9..a9e58e2 100644
--- a/doc/pcre2demo.3
+++ b/doc/pcre2demo.3
@@ -228,6 +228,21 @@ pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\en");
+/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
+to set the start of a match later than its end. In this demonstration program,
+we just detect this case and give up. */
+
+if (ovector[0] > ovector[1])
+ {
+ printf("\e\eK was used in an assertion to set the match start after its end.\en"
+ "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\en");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
@@ -355,6 +370,29 @@ for (;;)
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
+ /* If the previous match was not an empty string, there is one tricky case to
+ consider. If a pattern contains \eK within a lookbehind assertion at the
+ start, the end of the matched string can be at the offset where the match
+ started. Without special action, this leads to a loop that keeps on matching
+ the same substring. We must detect this case and arrange to move the start on
+ by one character. The pcre2_get_startchar() function returns the starting
+ offset that was passed to pcre2_match(). */
+
+ else
+ {
+ PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+ if (start_offset <= startchar)
+ {
+ if (startchar >= subject_length) break; /* Reached end of subject. */
+ start_offset = startchar + 1; /* Advance by one character. */
+ if (utf8) /* If UTF-8, it may be more */
+ { /* than one code unit. */
+ for (; start_offset < subject_length; start_offset++)
+ if ((subject[start_offset] & 0xc0) != 0x80) break;
+ }
+ }
+ }
+
/* Run the next matching operation */
rc = pcre2_match(
@@ -419,6 +457,21 @@ for (;;)
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\en");
+ /* We must guard against patterns such as /(?=.\eK)/ that use \eK in an
+ assertion to set the start of a match later than its end. In this
+ demonstration program, we just detect this case and give up. */
+
+ if (ovector[0] > ovector[1])
+ {
+ printf("\e\eK was used in an assertion to set the match start after its end.\en"
+ "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\en");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
--
2.13.6

View File

@ -0,0 +1,149 @@
From a109c9e35a040a7a8032c12ce7396bc949f3f735 Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Tue, 26 Dec 2017 15:10:04 +0000
Subject: [PATCH 3/3] Fix \K issues in pcre2grep.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@896 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
---
RunGrepTest | 11 +++++++++++
src/pcre2grep.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
testdata/grepoutput | 9 +++++++++
diff --git a/RunGrepTest b/RunGrepTest
index a7496cb..a26f677 100755
--- a/RunGrepTest
+++ b/RunGrepTest
@@ -630,6 +630,17 @@ echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -Mn -A2 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep
+echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
+printf "abcd\n" >testNinputgrep
+$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
+echo "RC=$?" >>testtrygrep
+$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
+echo "RC=$?" >>testtrygrep
+
# Now compare the results.
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
index e764313..02339f5 100644
--- a/src/pcre2grep.c
+++ b/src/pcre2grep.c
@@ -1607,7 +1607,7 @@ Returns: nothing
*/
static void
-do_after_lines(unsigned long int lastmatchnumber, char *lastmatchrestart,
+do_after_lines(unsigned long int lastmatchnumber, char *lastmatchrestart,
char *endptr, const char *printname)
{
if (after_context > 0 && lastmatchnumber > 0)
@@ -2764,11 +2764,38 @@ while (ptr < endptr)
if ((multiline || do_colour) && !invert)
{
int plength;
+ PCRE2_SIZE endprevious;
+
+ /* The use of \K may make the end offset earlier than the start. In
+ this situation, swap them round. */
+
+ if (offsets[0] > offsets[1])
+ {
+ PCRE2_SIZE temp = offsets[0];
+ offsets[0] = offsets[1];
+ offsets[1] = temp;
+ }
+
FWRITE_IGNORE(ptr, 1, offsets[0], stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
+
for (;;)
{
- startoffset = offsets[1]; /* Advance after previous match. */
+ PCRE2_SIZE oldstartoffset = pcre2_get_startchar(match_data);
+
+ endprevious = offsets[1];
+ startoffset = endprevious; /* Advance after previous match. */
+
+ /* If the pattern contained a lookbehind that included \K, it is
+ possible that the end of the match might be at or before the actual
+ starting offset we have just used. In this case, start one character
+ further on. */
+
+ if (startoffset <= oldstartoffset)
+ {
+ startoffset = oldstartoffset + 1;
+ if (utf) while ((ptr[startoffset] & 0xc0) == 0x80) startoffset++;
+ }
/* If the current match ended past the end of the line (only possible
in multiline mode), we must move on to the line in which it did end
@@ -2782,6 +2809,7 @@ while (ptr < endptr)
filepos += (int)(linelength + endlinelength);
linenumber++;
startoffset -= (int)(linelength + endlinelength);
+ endprevious -= (int)(linelength + endlinelength);
t = end_of_line(ptr, endptr, &endlinelength);
linelength = t - ptr - endlinelength;
length = (size_t)(endptr - ptr);
@@ -2797,7 +2825,18 @@ while (ptr < endptr)
loop for any that may follow. */
if (!match_patterns(ptr, length, options, startoffset, &mrc)) break;
- FWRITE_IGNORE(ptr + startoffset, 1, offsets[0] - startoffset, stdout);
+
+ /* The use of \K may make the end offset earlier than the start. In
+ this situation, swap them round. */
+
+ if (offsets[0] > offsets[1])
+ {
+ PCRE2_SIZE temp = offsets[0];
+ offsets[0] = offsets[1];
+ offsets[1] = temp;
+ }
+
+ FWRITE_IGNORE(ptr + endprevious, 1, offsets[0] - endprevious, stdout);
print_match(ptr + offsets[0], offsets[1] - offsets[0]);
}
@@ -2805,8 +2844,8 @@ while (ptr < endptr)
and its line-ending characters (if they matched the pattern), so there
may be no more to print. */
- plength = (int)((linelength + endlinelength) - startoffset);
- if (plength > 0) FWRITE_IGNORE(ptr + startoffset, 1, plength, stdout);
+ plength = (int)((linelength + endlinelength) - endprevious);
+ if (plength > 0) FWRITE_IGNORE(ptr + endprevious, 1, plength, stdout);
}
/* Not colouring or multiline; no need to search for further matches. */
diff --git a/testdata/grepoutput b/testdata/grepoutput
index 7e963fb..e49c2b2 100644
--- a/testdata/grepoutput
+++ b/testdata/grepoutput
@@ -936,3 +936,12 @@ end
16:start end in between start
end
RC=0
+---------------------------- Test 125 -----------------------------
+abcd
+RC=0
+abcd
+RC=0
+abcd
+RC=0
+abcd
+RC=0
--
2.13.6

View File

@ -0,0 +1,96 @@
From f442210323e228a407dfda75b6bb7a62e91111ee Mon Sep 17 00:00:00 2001
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
Date: Sat, 23 Dec 2017 17:15:51 +0000
Subject: [PATCH 1/3] Update pcre2demo to deal with various \K inside assertion
anomalies.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@894 6239d852-aaf2-0410-a92c-79f79f948069
Petr Písař: Ported to 10.30.
---
src/pcre2demo.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
diff --git a/src/pcre2demo.c b/src/pcre2demo.c
index 8ae49f1..5d9b321 100644
--- a/src/pcre2demo.c
+++ b/src/pcre2demo.c
@@ -211,6 +211,21 @@ pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
+to set the start of a match later than its end. In this demonstration program,
+we just detect this case and give up. */
+
+if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
@@ -338,6 +353,29 @@ for (;;)
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
+ /* If the previous match was not an empty string, there is one tricky case to
+ consider. If a pattern contains \K within a lookbehind assertion at the
+ start, the end of the matched string can be at the offset where the match
+ started. Without special action, this leads to a loop that keeps on matching
+ the same substring. We must detect this case and arrange to move the start on
+ by one character. The pcre2_get_startchar() function returns the starting
+ offset that was passed to pcre2_match(). */
+
+ else
+ {
+ PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+ if (start_offset <= startchar)
+ {
+ if (startchar >= subject_length) break; /* Reached end of subject. */
+ start_offset = startchar + 1; /* Advance by one character. */
+ if (utf8) /* If UTF-8, it may be more */
+ { /* than one code unit. */
+ for (; start_offset < subject_length; start_offset++)
+ if ((subject[start_offset] & 0xc0) != 0x80) break;
+ }
+ }
+ }
+
/* Run the next matching operation */
rc = pcre2_match(
@@ -402,6 +440,21 @@ for (;;)
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
+ /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+ assertion to set the start of a match later than its end. In this
+ demonstration program, we just detect this case and give up. */
+
+ if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
--
2.13.6

View File

@ -6,7 +6,7 @@
#%%global rcversion RC1
Name: pcre2
Version: 10.30
Release: %{?rcversion:0.}4%{?rcversion:.%rcversion}%{?dist}
Release: %{?rcversion:0.}5%{?rcversion:.%rcversion}%{?dist}
%global myversion %{version}%{?rcversion:-%rcversion}
Summary: Perl-compatible regular expression library
# the library: BSD with exceptions
@ -62,6 +62,15 @@ Patch6: pcre2-10.30-Change-pcre2grep-line-number-and-count-variables-to-.pat
# Fix incorrect first matching character when a backreference with zero minimum
# repeat starts a pattern, upstream bug #2209, in upstream after 10.30
Patch7: pcre2-10.30-Fix-incorrect-first-matching-character-when-a-backre.patch
# 1/2 Fix handling \K in an assertion in documentation, upstream bug #2211,
# in upstream after 10.30
Patch8: pcre2-10.30-Update-pcre2demo-to-deal-with-various-K-inside-asser.patch
# 2/2 Fix handling \K in an assertion in documentation, upstream bug #2211,
# in upstream after 10.30
Patch9: pcre2-10.30-Documentation-update.patch
# Fix handling \K in an assertion in pcre2grep tool, upstream bug #2211,
# in upstream after 10.30
Patch10: pcre2-10.30-Fix-K-issues-in-pcre2grep.patch
BuildRequires: autoconf
BuildRequires: automake
BuildRequires: coreutils
@ -143,6 +152,9 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
%patch5 -p1
%patch6 -p1
%patch7 -p1
%patch8 -p1
%patch9 -p1
%patch10 -p1
# Because of multilib patch
libtoolize --copy --force
autoreconf -vif
@ -246,6 +258,10 @@ make %{?_smp_mflags} check VERBOSE=yes
%{_mandir}/man1/pcre2test.*
%changelog
* Fri Jan 12 2018 Petr Pisar <ppisar@redhat.com> - 10.30-5
- Fix handling \K in an assertion in pcre2grep tool and documentation
(upstream bug #2211)
* Fri Dec 22 2017 Petr Pisar <ppisar@redhat.com> - 10.30-4
- Fix pcre2_jit_match() to properly check the pattern was JIT-compiled
- Allow pcre2grep match counter to handle values larger than 2147483647,