Fix pcre2grep --only-matching output when number of capturing groups exceeds 32
This commit is contained in:
parent
9bf6f2950c
commit
324f5cdf72
@ -0,0 +1,382 @@
|
|||||||
|
From e29388de53ea3a4f9d1c6b4932613681493ac9dc Mon Sep 17 00:00:00 2001
|
||||||
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||||||
|
Date: Sat, 15 Jun 2019 15:51:07 +0000
|
||||||
|
Subject: [PATCH] Fix pcre2grep -o bug when ovector overflows; add option to
|
||||||
|
adjust the limit; raise the default limit; give error if -o requests an
|
||||||
|
uncaptured parens.
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1106 6239d852-aaf2-0410-a92c-79f79f948069
|
||||||
|
Petr Písař: Ported to 10.33.
|
||||||
|
|
||||||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||||
|
---
|
||||||
|
RunGrepTest | 7 ++++++
|
||||||
|
doc/html/pcre2api.html | 12 +++++-----
|
||||||
|
doc/html/pcre2grep.html | 28 +++++++++++++++-------
|
||||||
|
doc/html/pcre2test.html | 4 +++-
|
||||||
|
doc/pcre2grep.1 | 26 +++++++++++++-------
|
||||||
|
doc/pcre2grep.txt | 43 ++++++++++++++++++++-------------
|
||||||
|
doc/pcre2test.txt | 4 +++-
|
||||||
|
src/pcre2grep.c | 53 ++++++++++++++++++++++++++++-------------
|
||||||
|
testdata/grepoutput | 7 ++++++
|
||||||
|
9 files changed, 126 insertions(+), 58 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/RunGrepTest b/RunGrepTest
|
||||||
|
index bac1f1b..ea37f70 100755
|
||||||
|
--- a/RunGrepTest
|
||||||
|
+++ b/RunGrepTest
|
||||||
|
@@ -653,6 +653,13 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
|
||||||
|
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
+echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
|
||||||
|
+(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
|
||||||
|
+echo "RC=$?" >>testtrygrep
|
||||||
|
+
|
||||||
|
+echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
|
||||||
|
+(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||||
|
+echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
# Now compare the results.
|
||||||
|
|
||||||
|
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
|
||||||
|
index 7ca39f5..84f4442 100644
|
||||||
|
--- a/doc/html/pcre2api.html
|
||||||
|
+++ b/doc/html/pcre2api.html
|
||||||
|
@@ -2252,12 +2252,12 @@ segment.
|
||||||
|
PCRE2_INFO_MINLENGTH
|
||||||
|
</pre>
|
||||||
|
If a minimum length for matching subject strings was computed, its value is
|
||||||
|
-returned. Otherwise the returned value is 0. The value is a number of
|
||||||
|
-characters, which in UTF mode may be different from the number of code units.
|
||||||
|
-The third argument should point to an <b>uint32_t</b> variable. The value is a
|
||||||
|
-lower bound to the length of any matching string. There may not be any strings
|
||||||
|
-of that length that do actually match, but every string that does match is at
|
||||||
|
-least that long.
|
||||||
|
+returned. Otherwise the returned value is 0. This value is not computed when
|
||||||
|
+PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in
|
||||||
|
+UTF mode may be different from the number of code units. The third argument
|
||||||
|
+should point to an <b>uint32_t</b> variable. The value is a lower bound to the
|
||||||
|
+length of any matching string. There may not be any strings of that length that
|
||||||
|
+do actually match, but every string that does match is at least that long.
|
||||||
|
<pre>
|
||||||
|
PCRE2_INFO_NAMECOUNT
|
||||||
|
PCRE2_INFO_NAMEENTRYSIZE
|
||||||
|
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
|
||||||
|
index d66cee3..de699e7 100644
|
||||||
|
--- a/doc/html/pcre2grep.html
|
||||||
|
+++ b/doc/html/pcre2grep.html
|
||||||
|
@@ -685,20 +685,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
|
||||||
|
<P>
|
||||||
|
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||||
|
Show only the part of the line that matched the capturing parentheses of the
|
||||||
|
-given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||||
|
-equivalent to <b>-o</b> without a number. Because these options can be given
|
||||||
|
-without an argument (see above), if an argument is present, it must be given in
|
||||||
|
-the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||||
|
-for the non-argument case above also apply to this option. If the specified
|
||||||
|
-capturing parentheses do not exist in the pattern, or were not set in the
|
||||||
|
-match, nothing is output unless the file name or line number are being output.
|
||||||
|
+given number. Up to 50 capturing parentheses are supported by default. This
|
||||||
|
+limit can be changed via the <b>--om-capture</b> option. A pattern may contain
|
||||||
|
+any number of capturing parentheses, but only those whose number is within the
|
||||||
|
+limit can be accessed by <b>-o</b>. An error occurs if the number specified by
|
||||||
|
+<b>-o</b> is greater than the limit.
|
||||||
|
+<br>
|
||||||
|
+<br>
|
||||||
|
+-o0 is the same as <b>-o</b> without a number. Because these options can be
|
||||||
|
+given without an argument (see above), if an argument is present, it must be
|
||||||
|
+given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||||
|
+comments given for the non-argument case above also apply to this option. If
|
||||||
|
+the specified capturing parentheses do not exist in the pattern, or were not
|
||||||
|
+set in the match, nothing is output unless the file name or line number are
|
||||||
|
+being output.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
If this option is given multiple times, multiple substrings are output for each
|
||||||
|
match, in the order the options are given, and all on one line. For example,
|
||||||
|
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||||
|
then 3 again to be output. By default, there is no separator (but see the next
|
||||||
|
-option).
|
||||||
|
+but one option).
|
||||||
|
+</P>
|
||||||
|
+<P>
|
||||||
|
+<b>--om-capture</b>=<i>number</i>
|
||||||
|
+Set the number of capturing parentheses that can be accessed by <b>-o</b>. The
|
||||||
|
+default is 50.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--om-separator</b>=<i>text</i>
|
||||||
|
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
|
||||||
|
index 083d5cc..4be47c6 100644
|
||||||
|
--- a/doc/html/pcre2test.html
|
||||||
|
+++ b/doc/html/pcre2test.html
|
||||||
|
@@ -738,7 +738,9 @@ options, the line is omitted. "First code unit" is where any match must start;
|
||||||
|
if there is more than one they are listed as "starting code units". "Last code
|
||||||
|
unit" is the last literal code unit that must be present in any match. This is
|
||||||
|
not necessarily the last character. These lines are omitted if no starting or
|
||||||
|
-ending code units are recorded.
|
||||||
|
+ending code units are recorded. The subject length line is omitted when
|
||||||
|
+<b>no_start_optimize</b> is set because the minimum length is not calculated
|
||||||
|
+when it can never be used.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
|
||||||
|
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
|
||||||
|
index 6b3219b..1dcdb68 100644
|
||||||
|
--- a/doc/pcre2grep.1
|
||||||
|
+++ b/doc/pcre2grep.1
|
||||||
|
@@ -596,19 +596,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
|
||||||
|
.TP
|
||||||
|
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||||||
|
Show only the part of the line that matched the capturing parentheses of the
|
||||||
|
-given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||||
|
-equivalent to \fB-o\fP without a number. Because these options can be given
|
||||||
|
-without an argument (see above), if an argument is present, it must be given in
|
||||||
|
-the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||||
|
-for the non-argument case above also apply to this option. If the specified
|
||||||
|
-capturing parentheses do not exist in the pattern, or were not set in the
|
||||||
|
-match, nothing is output unless the file name or line number are being output.
|
||||||
|
+given number. Up to 50 capturing parentheses are supported by default. This
|
||||||
|
+limit can be changed via the \fB--om-capture\fP option. A pattern may contain
|
||||||
|
+any number of capturing parentheses, but only those whose number is within the
|
||||||
|
+limit can be accessed by \fB-o\fP. An error occurs if the number specified by
|
||||||
|
+\fB-o\fP is greater than the limit.
|
||||||
|
+.sp
|
||||||
|
+-o0 is the same as \fB-o\fP without a number. Because these options can be
|
||||||
|
+given without an argument (see above), if an argument is present, it must be
|
||||||
|
+given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||||
|
+comments given for the non-argument case above also apply to this option. If
|
||||||
|
+the specified capturing parentheses do not exist in the pattern, or were not
|
||||||
|
+set in the match, nothing is output unless the file name or line number are
|
||||||
|
+being output.
|
||||||
|
.sp
|
||||||
|
If this option is given multiple times, multiple substrings are output for each
|
||||||
|
match, in the order the options are given, and all on one line. For example,
|
||||||
|
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||||
|
then 3 again to be output. By default, there is no separator (but see the next
|
||||||
|
-option).
|
||||||
|
+but one option).
|
||||||
|
+.TP
|
||||||
|
+\fB--om-capture\fP=\fInumber\fP
|
||||||
|
+Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
|
||||||
|
+default is 50.
|
||||||
|
.TP
|
||||||
|
\fB--om-separator\fP=\fItext\fP
|
||||||
|
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||||||
|
diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
|
||||||
|
index cd44fe0..2920643 100644
|
||||||
|
--- a/doc/pcre2grep.txt
|
||||||
|
+++ b/doc/pcre2grep.txt
|
||||||
|
@@ -662,23 +662,32 @@ OPTIONS
|
||||||
|
|
||||||
|
-onumber, --only-matching=number
|
||||||
|
Show only the part of the line that matched the capturing
|
||||||
|
- parentheses of the given number. Up to 32 capturing parenthe-
|
||||||
|
- ses are supported, and -o0 is equivalent to -o without a num-
|
||||||
|
- ber. Because these options can be given without an argument
|
||||||
|
- (see above), if an argument is present, it must be given in
|
||||||
|
- the same shell item, for example, -o3 or --only-matching=2.
|
||||||
|
- The comments given for the non-argument case above also apply
|
||||||
|
- to this option. If the specified capturing parentheses do not
|
||||||
|
- exist in the pattern, or were not set in the match, nothing
|
||||||
|
- is output unless the file name or line number are being out-
|
||||||
|
- put.
|
||||||
|
-
|
||||||
|
- If this option is given multiple times, multiple substrings
|
||||||
|
- are output for each match, in the order the options are
|
||||||
|
- given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||||
|
- the substrings matched by capturing parentheses 3 and 1 and
|
||||||
|
- then 3 again to be output. By default, there is no separator
|
||||||
|
- (but see the next option).
|
||||||
|
+ parentheses of the given number. Up to 50 capturing parenthe-
|
||||||
|
+ ses are supported by default. This limit can be changed via
|
||||||
|
+ the --om-capture option. A pattern may contain any number of
|
||||||
|
+ capturing parentheses, but only those whose number is within
|
||||||
|
+ the limit can be accessed by -o. An error occurs if the num-
|
||||||
|
+ ber specified by -o is greater than the limit.
|
||||||
|
+
|
||||||
|
+ -o0 is the same as -o without a number. Because these options
|
||||||
|
+ can be given without an argument (see above), if an argument
|
||||||
|
+ is present, it must be given in the same shell item, for
|
||||||
|
+ example, -o3 or --only-matching=2. The comments given for the
|
||||||
|
+ non-argument case above also apply to this option. If the
|
||||||
|
+ specified capturing parentheses do not exist in the pattern,
|
||||||
|
+ or were not set in the match, nothing is output unless the
|
||||||
|
+ file name or line number are being output.
|
||||||
|
+
|
||||||
|
+ If this option is given multiple times, multiple substrings
|
||||||
|
+ are output for each match, in the order the options are
|
||||||
|
+ given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||||
|
+ the substrings matched by capturing parentheses 3 and 1 and
|
||||||
|
+ then 3 again to be output. By default, there is no separator
|
||||||
|
+ (but see the next but one option).
|
||||||
|
+
|
||||||
|
+ --om-capture=number
|
||||||
|
+ Set the number of capturing parentheses that can be accessed
|
||||||
|
+ by -o. The default is 50.
|
||||||
|
|
||||||
|
--om-separator=text
|
||||||
|
Specify a separating string for multiple occurrences of -o.
|
||||||
|
diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt
|
||||||
|
index cbe3528..f287f6d 100644
|
||||||
|
--- a/doc/pcre2test.txt
|
||||||
|
+++ b/doc/pcre2test.txt
|
||||||
|
@@ -669,7 +669,9 @@ PATTERN MODIFIERS
|
||||||
|
as "starting code units". "Last code unit" is the last literal code
|
||||||
|
unit that must be present in any match. This is not necessarily the
|
||||||
|
last character. These lines are omitted if no starting or ending code
|
||||||
|
- units are recorded.
|
||||||
|
+ units are recorded. The subject length line is omitted when
|
||||||
|
+ no_start_optimize is set because the minimum length is not calculated
|
||||||
|
+ when it can never be used.
|
||||||
|
|
||||||
|
The framesize modifier shows the size, in bytes, of the storage frames
|
||||||
|
used by pcre2_match() for handling backtracking. The size depends on
|
||||||
|
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
|
||||||
|
index a3cc3ec..d17cd2a 100644
|
||||||
|
--- a/src/pcre2grep.c
|
||||||
|
+++ b/src/pcre2grep.c
|
||||||
|
@@ -115,7 +115,7 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
|
||||||
|
|
||||||
|
typedef int BOOL;
|
||||||
|
|
||||||
|
-#define OFFSET_SIZE 33
|
||||||
|
+#define DEFAULT_CAPTURE_MAX 50
|
||||||
|
|
||||||
|
#if BUFSIZ > 8192
|
||||||
|
#define MAXPATLEN BUFSIZ
|
||||||
|
@@ -242,6 +242,8 @@ static pcre2_compile_context *compile_context;
|
||||||
|
static pcre2_match_context *match_context;
|
||||||
|
static pcre2_match_data *match_data;
|
||||||
|
static PCRE2_SIZE *offsets;
|
||||||
|
+static uint32_t offset_size;
|
||||||
|
+static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
|
||||||
|
|
||||||
|
static BOOL count_only = FALSE;
|
||||||
|
static BOOL do_colour = FALSE;
|
||||||
|
@@ -391,6 +393,7 @@ used to identify them. */
|
||||||
|
#define N_INCLUDE_FROM (-21)
|
||||||
|
#define N_OM_SEPARATOR (-22)
|
||||||
|
#define N_MAX_BUFSIZE (-23)
|
||||||
|
+#define N_OM_CAPTURE (-24)
|
||||||
|
|
||||||
|
static option_item optionlist[] = {
|
||||||
|
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
|
||||||
|
@@ -437,6 +440,7 @@ static option_item optionlist[] = {
|
||||||
|
{ OP_STRING, 'O', &output_text, "output=text", "show only this text (possibly expanded)" },
|
||||||
|
{ OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
|
||||||
|
{ OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
|
||||||
|
+ { OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" },
|
||||||
|
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
|
||||||
|
{ OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" },
|
||||||
|
{ OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" },
|
||||||
|
@@ -2568,7 +2572,7 @@ while (ptr < endptr)
|
||||||
|
|
||||||
|
for (i = 0; i < jfriedl_XR; i++)
|
||||||
|
match = (pcre_exec(patterns->compiled, patterns->hint, ptr, length, 0,
|
||||||
|
- PCRE2_NOTEMPTY, offsets, OFFSET_SIZE) >= 0);
|
||||||
|
+ PCRE2_NOTEMPTY, offsets, offset_size) >= 0);
|
||||||
|
|
||||||
|
if (gettimeofday(&end_time, &dummy) != 0)
|
||||||
|
perror("bad gettimeofday");
|
||||||
|
@@ -2688,7 +2692,7 @@ while (ptr < endptr)
|
||||||
|
for (om = only_matching; om != NULL; om = om->next)
|
||||||
|
{
|
||||||
|
int n = om->groupnum;
|
||||||
|
- if (n < mrc)
|
||||||
|
+ if (n == 0 || n < mrc)
|
||||||
|
{
|
||||||
|
int plen = offsets[2*n + 1] - offsets[2*n];
|
||||||
|
if (plen > 0)
|
||||||
|
@@ -3639,6 +3643,7 @@ int rc = 1;
|
||||||
|
BOOL only_one_at_top;
|
||||||
|
patstr *cp;
|
||||||
|
fnstr *fn;
|
||||||
|
+omstr *om;
|
||||||
|
const char *locale_from = "--locale";
|
||||||
|
|
||||||
|
#ifdef SUPPORT_PCRE2GREP_JIT
|
||||||
|
@@ -3655,20 +3660,6 @@ must use STDOUT_NL to terminate lines. */
|
||||||
|
_setmode(_fileno(stdout), _O_BINARY);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-/* Set up a default compile and match contexts and a match data block. */
|
||||||
|
-
|
||||||
|
-compile_context = pcre2_compile_context_create(NULL);
|
||||||
|
-match_context = pcre2_match_context_create(NULL);
|
||||||
|
-match_data = pcre2_match_data_create(OFFSET_SIZE, NULL);
|
||||||
|
-offsets = pcre2_get_ovector_pointer(match_data);
|
||||||
|
-
|
||||||
|
-/* If string (script) callouts are supported, set up the callout processing
|
||||||
|
-function. */
|
||||||
|
-
|
||||||
|
-#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||||||
|
-pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||||||
|
-#endif
|
||||||
|
-
|
||||||
|
/* Process the options */
|
||||||
|
|
||||||
|
for (i = 1; i < argc; i++)
|
||||||
|
@@ -4015,12 +4006,40 @@ if (only_matching_count > 1)
|
||||||
|
pcre2grep_exit(usage(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
+/* Check that there is a big enough ovector for all -o settings. */
|
||||||
|
+
|
||||||
|
+for (om = only_matching; om != NULL; om = om->next)
|
||||||
|
+ {
|
||||||
|
+ int n = om->groupnum;
|
||||||
|
+ if (n > (int)capture_max)
|
||||||
|
+ {
|
||||||
|
+ fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n);
|
||||||
|
+ fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n");
|
||||||
|
+ goto EXIT2;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Check the text supplied to --output for errors. */
|
||||||
|
|
||||||
|
if (output_text != NULL &&
|
||||||
|
!syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
|
||||||
|
goto EXIT2;
|
||||||
|
|
||||||
|
+/* Set up default compile and match contexts and a match data block. */
|
||||||
|
+
|
||||||
|
+offset_size = capture_max + 1;
|
||||||
|
+compile_context = pcre2_compile_context_create(NULL);
|
||||||
|
+match_context = pcre2_match_context_create(NULL);
|
||||||
|
+match_data = pcre2_match_data_create(offset_size, NULL);
|
||||||
|
+offsets = pcre2_get_ovector_pointer(match_data);
|
||||||
|
+
|
||||||
|
+/* If string (script) callouts are supported, set up the callout processing
|
||||||
|
+function. */
|
||||||
|
+
|
||||||
|
+#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||||||
|
+pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
/* Put limits into the match data block. */
|
||||||
|
|
||||||
|
if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
|
||||||
|
diff --git a/testdata/grepoutput b/testdata/grepoutput
|
||||||
|
index 2bd69be..a9297e1 100644
|
||||||
|
--- a/testdata/grepoutput
|
||||||
|
+++ b/testdata/grepoutput
|
||||||
|
@@ -949,3 +949,10 @@ RC=0
|
||||||
|
---------------------------- Test 126 -----------------------------
|
||||||
|
ABC |