383 lines
17 KiB
Diff
383 lines
17 KiB
Diff
From e29388de53ea3a4f9d1c6b4932613681493ac9dc Mon Sep 17 00:00:00 2001
|
||
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||
Date: Sat, 15 Jun 2019 15:51:07 +0000
|
||
Subject: [PATCH] Fix pcre2grep -o bug when ovector overflows; add option to
|
||
adjust the limit; raise the default limit; give error if -o requests an
|
||
uncaptured parenthesis.
|
||
MIME-Version: 1.0
|
||
Content-Type: text/plain; charset=UTF-8
|
||
Content-Transfer-Encoding: 8bit
|
||
|
||
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1106 6239d852-aaf2-0410-a92c-79f79f948069
|
||
Petr Písař: Ported to 10.33.
|
||
|
||
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||
---
|
||
RunGrepTest | 7 ++++++
|
||
doc/html/pcre2api.html | 12 +++++-----
|
||
doc/html/pcre2grep.html | 28 +++++++++++++++-------
|
||
doc/html/pcre2test.html | 4 +++-
|
||
doc/pcre2grep.1 | 26 +++++++++++++-------
|
||
doc/pcre2grep.txt | 43 ++++++++++++++++++++-------------
|
||
doc/pcre2test.txt | 4 +++-
|
||
src/pcre2grep.c | 53 ++++++++++++++++++++++++++++-------------
|
||
testdata/grepoutput | 7 ++++++
|
||
9 files changed, 126 insertions(+), 58 deletions(-)
|
||
|
||
diff --git a/RunGrepTest b/RunGrepTest
|
||
index bac1f1b..ea37f70 100755
|
||
--- a/RunGrepTest
|
||
+++ b/RunGrepTest
|
||
@@ -653,6 +653,13 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
|
||
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
|
||
echo "RC=$?" >>testtrygrep
|
||
|
||
+echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
|
||
+(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
|
||
+echo "RC=$?" >>testtrygrep
|
||
+
|
||
+echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
|
||
+(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||
+echo "RC=$?" >>testtrygrep
|
||
|
||
# Now compare the results.
|
||
|
||
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
|
||
index 7ca39f5..84f4442 100644
|
||
--- a/doc/html/pcre2api.html
|
||
+++ b/doc/html/pcre2api.html
|
||
@@ -2252,12 +2252,12 @@ segment.
|
||
PCRE2_INFO_MINLENGTH
|
||
</pre>
|
||
If a minimum length for matching subject strings was computed, its value is
|
||
-returned. Otherwise the returned value is 0. The value is a number of
|
||
-characters, which in UTF mode may be different from the number of code units.
|
||
-The third argument should point to an <b>uint32_t</b> variable. The value is a
|
||
-lower bound to the length of any matching string. There may not be any strings
|
||
-of that length that do actually match, but every string that does match is at
|
||
-least that long.
|
||
+returned. Otherwise the returned value is 0. This value is not computed when
|
||
+PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in
|
||
+UTF mode may be different from the number of code units. The third argument
|
||
+should point to a <b>uint32_t</b> variable. The value is a lower bound to the
|
||
+length of any matching string. There may not be any strings of that length that
|
||
+do actually match, but every string that does match is at least that long.
|
||
<pre>
|
||
PCRE2_INFO_NAMECOUNT
|
||
PCRE2_INFO_NAMEENTRYSIZE
|
||
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
|
||
index d66cee3..de699e7 100644
|
||
--- a/doc/html/pcre2grep.html
|
||
+++ b/doc/html/pcre2grep.html
|
||
@@ -685,20 +685,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
|
||
<P>
|
||
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||
Show only the part of the line that matched the capturing parentheses of the
|
||
-given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||
-equivalent to <b>-o</b> without a number. Because these options can be given
|
||
-without an argument (see above), if an argument is present, it must be given in
|
||
-the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||
-for the non-argument case above also apply to this option. If the specified
|
||
-capturing parentheses do not exist in the pattern, or were not set in the
|
||
-match, nothing is output unless the file name or line number are being output.
|
||
+given number. Up to 50 capturing parentheses are supported by default. This
|
||
+limit can be changed via the <b>--om-capture</b> option. A pattern may contain
|
||
+any number of capturing parentheses, but only those whose number is within the
|
||
+limit can be accessed by <b>-o</b>. An error occurs if the number specified by
|
||
+<b>-o</b> is greater than the limit.
|
||
+<br>
|
||
+<br>
|
||
+-o0 is the same as <b>-o</b> without a number. Because these options can be
|
||
+given without an argument (see above), if an argument is present, it must be
|
||
+given in the same shell item, for example, -o3 or --only-matching=2. The
|
||
+comments given for the non-argument case above also apply to this option. If
|
||
+the specified capturing parentheses do not exist in the pattern, or were not
|
||
+set in the match, nothing is output unless the file name or line number are
|
||
+being output.
|
||
<br>
|
||
<br>
|
||
If this option is given multiple times, multiple substrings are output for each
|
||
match, in the order the options are given, and all on one line. For example,
|
||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||
then 3 again to be output. By default, there is no separator (but see the next
|
||
-option).
|
||
+but one option).
|
||
+</P>
|
||
+<P>
|
||
+<b>--om-capture</b>=<i>number</i>
|
||
+Set the number of capturing parentheses that can be accessed by <b>-o</b>. The
|
||
+default is 50.
|
||
</P>
|
||
<P>
|
||
<b>--om-separator</b>=<i>text</i>
|
||
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
|
||
index 083d5cc..4be47c6 100644
|
||
--- a/doc/html/pcre2test.html
|
||
+++ b/doc/html/pcre2test.html
|
||
@@ -738,7 +738,9 @@ options, the line is omitted. "First code unit" is where any match must start;
|
||
if there is more than one they are listed as "starting code units". "Last code
|
||
unit" is the last literal code unit that must be present in any match. This is
|
||
not necessarily the last character. These lines are omitted if no starting or
|
||
-ending code units are recorded.
|
||
+ending code units are recorded. The subject length line is omitted when
|
||
+<b>no_start_optimize</b> is set because the minimum length is not calculated
|
||
+when it can never be used.
|
||
</P>
|
||
<P>
|
||
The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
|
||
diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1
|
||
index 6b3219b..1dcdb68 100644
|
||
--- a/doc/pcre2grep.1
|
||
+++ b/doc/pcre2grep.1
|
||
@@ -596,19 +596,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
|
||
.TP
|
||
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||
Show only the part of the line that matched the capturing parentheses of the
|
||
-given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||
-equivalent to \fB-o\fP without a number. Because these options can be given
|
||
-without an argument (see above), if an argument is present, it must be given in
|
||
-the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||
-for the non-argument case above also apply to this option. If the specified
|
||
-capturing parentheses do not exist in the pattern, or were not set in the
|
||
-match, nothing is output unless the file name or line number are being output.
|
||
+given number. Up to 50 capturing parentheses are supported by default. This
|
||
+limit can be changed via the \fB--om-capture\fP option. A pattern may contain
|
||
+any number of capturing parentheses, but only those whose number is within the
|
||
+limit can be accessed by \fB-o\fP. An error occurs if the number specified by
|
||
+\fB-o\fP is greater than the limit.
|
||
+.sp
|
||
+-o0 is the same as \fB-o\fP without a number. Because these options can be
|
||
+given without an argument (see above), if an argument is present, it must be
|
||
+given in the same shell item, for example, -o3 or --only-matching=2. The
|
||
+comments given for the non-argument case above also apply to this option. If
|
||
+the specified capturing parentheses do not exist in the pattern, or were not
|
||
+set in the match, nothing is output unless the file name or line number are
|
||
+being output.
|
||
.sp
|
||
If this option is given multiple times, multiple substrings are output for each
|
||
match, in the order the options are given, and all on one line. For example,
|
||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||
then 3 again to be output. By default, there is no separator (but see the next
|
||
-option).
|
||
+but one option).
|
||
+.TP
|
||
+\fB--om-capture\fP=\fInumber\fP
|
||
+Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
|
||
+default is 50.
|
||
.TP
|
||
\fB--om-separator\fP=\fItext\fP
|
||
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||
diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt
|
||
index cd44fe0..2920643 100644
|
||
--- a/doc/pcre2grep.txt
|
||
+++ b/doc/pcre2grep.txt
|
||
@@ -662,23 +662,32 @@ OPTIONS
|
||
|
||
-onumber, --only-matching=number
|
||
Show only the part of the line that matched the capturing
|
||
- parentheses of the given number. Up to 32 capturing parenthe-
|
||
- ses are supported, and -o0 is equivalent to -o without a num-
|
||
- ber. Because these options can be given without an argument
|
||
- (see above), if an argument is present, it must be given in
|
||
- the same shell item, for example, -o3 or --only-matching=2.
|
||
- The comments given for the non-argument case above also apply
|
||
- to this option. If the specified capturing parentheses do not
|
||
- exist in the pattern, or were not set in the match, nothing
|
||
- is output unless the file name or line number are being out-
|
||
- put.
|
||
-
|
||
- If this option is given multiple times, multiple substrings
|
||
- are output for each match, in the order the options are
|
||
- given, and all on one line. For example, -o3 -o1 -o3 causes
|
||
- the substrings matched by capturing parentheses 3 and 1 and
|
||
- then 3 again to be output. By default, there is no separator
|
||
- (but see the next option).
|
||
+ parentheses of the given number. Up to 50 capturing parenthe-
|
||
+ ses are supported by default. This limit can be changed via
|
||
+ the --om-capture option. A pattern may contain any number of
|
||
+ capturing parentheses, but only those whose number is within
|
||
+ the limit can be accessed by -o. An error occurs if the num-
|
||
+ ber specified by -o is greater than the limit.
|
||
+
|
||
+ -o0 is the same as -o without a number. Because these options
|
||
+ can be given without an argument (see above), if an argument
|
||
+ is present, it must be given in the same shell item, for
|
||
+ example, -o3 or --only-matching=2. The comments given for the
|
||
+ non-argument case above also apply to this option. If the
|
||
+ specified capturing parentheses do not exist in the pattern,
|
||
+ or were not set in the match, nothing is output unless the
|
||
+ file name or line number are being output.
|
||
+
|
||
+ If this option is given multiple times, multiple substrings
|
||
+ are output for each match, in the order the options are
|
||
+ given, and all on one line. For example, -o3 -o1 -o3 causes
|
||
+ the substrings matched by capturing parentheses 3 and 1 and
|
||
+ then 3 again to be output. By default, there is no separator
|
||
+ (but see the next but one option).
|
||
+
|
||
+ --om-capture=number
|
||
+ Set the number of capturing parentheses that can be accessed
|
||
+ by -o. The default is 50.
|
||
|
||
--om-separator=text
|
||
Specify a separating string for multiple occurrences of -o.
|
||
diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt
|
||
index cbe3528..f287f6d 100644
|
||
--- a/doc/pcre2test.txt
|
||
+++ b/doc/pcre2test.txt
|
||
@@ -669,7 +669,9 @@ PATTERN MODIFIERS
|
||
as "starting code units". "Last code unit" is the last literal code
|
||
unit that must be present in any match. This is not necessarily the
|
||
last character. These lines are omitted if no starting or ending code
|
||
- units are recorded.
|
||
+ units are recorded. The subject length line is omitted when
|
||
+ no_start_optimize is set because the minimum length is not calculated
|
||
+ when it can never be used.
|
||
|
||
The framesize modifier shows the size, in bytes, of the storage frames
|
||
used by pcre2_match() for handling backtracking. The size depends on
|
||
diff --git a/src/pcre2grep.c b/src/pcre2grep.c
|
||
index a3cc3ec..d17cd2a 100644
|
||
--- a/src/pcre2grep.c
|
||
+++ b/src/pcre2grep.c
|
||
@@ -115,7 +115,7 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
|
||
|
||
typedef int BOOL;
|
||
|
||
-#define OFFSET_SIZE 33
|
||
+#define DEFAULT_CAPTURE_MAX 50
|
||
|
||
#if BUFSIZ > 8192
|
||
#define MAXPATLEN BUFSIZ
|
||
@@ -242,6 +242,8 @@ static pcre2_compile_context *compile_context;
|
||
static pcre2_match_context *match_context;
|
||
static pcre2_match_data *match_data;
|
||
static PCRE2_SIZE *offsets;
|
||
+static uint32_t offset_size;
|
||
+static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
|
||
|
||
static BOOL count_only = FALSE;
|
||
static BOOL do_colour = FALSE;
|
||
@@ -391,6 +393,7 @@ used to identify them. */
|
||
#define N_INCLUDE_FROM (-21)
|
||
#define N_OM_SEPARATOR (-22)
|
||
#define N_MAX_BUFSIZE (-23)
|
||
+#define N_OM_CAPTURE (-24)
|
||
|
||
static option_item optionlist[] = {
|
||
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
|
||
@@ -437,6 +440,7 @@ static option_item optionlist[] = {
|
||
{ OP_STRING, 'O', &output_text, "output=text", "show only this text (possibly expanded)" },
|
||
{ OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
|
||
{ OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
|
||
+ { OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" },
|
||
{ OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
|
||
{ OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" },
|
||
{ OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" },
|
||
@@ -2568,7 +2572,7 @@ while (ptr < endptr)
|
||
|
||
for (i = 0; i < jfriedl_XR; i++)
|
||
match = (pcre_exec(patterns->compiled, patterns->hint, ptr, length, 0,
|
||
- PCRE2_NOTEMPTY, offsets, OFFSET_SIZE) >= 0);
|
||
+ PCRE2_NOTEMPTY, offsets, offset_size) >= 0);
|
||
|
||
if (gettimeofday(&end_time, &dummy) != 0)
|
||
perror("bad gettimeofday");
|
||
@@ -2688,7 +2692,7 @@ while (ptr < endptr)
|
||
for (om = only_matching; om != NULL; om = om->next)
|
||
{
|
||
int n = om->groupnum;
|
||
- if (n < mrc)
|
||
+ if (n == 0 || n < mrc)
|
||
{
|
||
int plen = offsets[2*n + 1] - offsets[2*n];
|
||
if (plen > 0)
|
||
@@ -3639,6 +3643,7 @@ int rc = 1;
|
||
BOOL only_one_at_top;
|
||
patstr *cp;
|
||
fnstr *fn;
|
||
+omstr *om;
|
||
const char *locale_from = "--locale";
|
||
|
||
#ifdef SUPPORT_PCRE2GREP_JIT
|
||
@@ -3655,20 +3660,6 @@ must use STDOUT_NL to terminate lines. */
|
||
_setmode(_fileno(stdout), _O_BINARY);
|
||
#endif
|
||
|
||
-/* Set up a default compile and match contexts and a match data block. */
|
||
-
|
||
-compile_context = pcre2_compile_context_create(NULL);
|
||
-match_context = pcre2_match_context_create(NULL);
|
||
-match_data = pcre2_match_data_create(OFFSET_SIZE, NULL);
|
||
-offsets = pcre2_get_ovector_pointer(match_data);
|
||
-
|
||
-/* If string (script) callouts are supported, set up the callout processing
|
||
-function. */
|
||
-
|
||
-#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||
-pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||
-#endif
|
||
-
|
||
/* Process the options */
|
||
|
||
for (i = 1; i < argc; i++)
|
||
@@ -4015,12 +4006,40 @@ if (only_matching_count > 1)
|
||
pcre2grep_exit(usage(2));
|
||
}
|
||
|
||
+/* Check that there is a big enough ovector for all -o settings. */
|
||
+
|
||
+for (om = only_matching; om != NULL; om = om->next)
|
||
+ {
|
||
+ int n = om->groupnum;
|
||
+ if (n > (int)capture_max)
|
||
+ {
|
||
+ fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n);
|
||
+ fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n");
|
||
+ goto EXIT2;
|
||
+ }
|
||
+ }
|
||
+
|
||
/* Check the text supplied to --output for errors. */
|
||
|
||
if (output_text != NULL &&
|
||
!syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
|
||
goto EXIT2;
|
||
|
||
+/* Set up default compile and match contexts and a match data block. */
|
||
+
|
||
+offset_size = capture_max + 1;
|
||
+compile_context = pcre2_compile_context_create(NULL);
|
||
+match_context = pcre2_match_context_create(NULL);
|
||
+match_data = pcre2_match_data_create(offset_size, NULL);
|
||
+offsets = pcre2_get_ovector_pointer(match_data);
|
||
+
|
||
+/* If string (script) callouts are supported, set up the callout processing
|
||
+function. */
|
||
+
|
||
+#ifdef SUPPORT_PCRE2GREP_CALLOUT
|
||
+pcre2_set_callout(match_context, pcre2grep_callout, NULL);
|
||
+#endif
|
||
+
|
||
/* Put limits into the match data block. */
|
||
|
||
if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
|
||
diff --git a/testdata/grepoutput b/testdata/grepoutput
|
||
index 2bd69be..a9297e1 100644
|
||
--- a/testdata/grepoutput
|
||
+++ b/testdata/grepoutput
|
||
@@ -949,3 +949,10 @@ RC=0
|
||
---------------------------- Test 126 -----------------------------
|
||
ABC |