From e29388de53ea3a4f9d1c6b4932613681493ac9dc Mon Sep 17 00:00:00 2001 From: ph10 Date: Sat, 15 Jun 2019 15:51:07 +0000 Subject: [PATCH] Fix pcre2grep -o bug when ovector overflows; add option to adjust the limit; raise the default limit; give error if -o requests an uncaptured parens. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1106 6239d852-aaf2-0410-a92c-79f79f948069 Petr Písař: Ported to 10.33. Signed-off-by: Petr Písař --- RunGrepTest | 7 ++++++ doc/html/pcre2api.html | 12 +++++----- doc/html/pcre2grep.html | 28 +++++++++++++++------- doc/html/pcre2test.html | 4 +++- doc/pcre2grep.1 | 26 +++++++++++++------- doc/pcre2grep.txt | 43 ++++++++++++++++++++------------- doc/pcre2test.txt | 4 +++- src/pcre2grep.c | 53 ++++++++++++++++++++++++++++------------- testdata/grepoutput | 7 ++++++ 9 files changed, 126 insertions(+), 58 deletions(-) diff --git a/RunGrepTest b/RunGrepTest index bac1f1b..ea37f70 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -653,6 +653,13 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep $valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep echo "RC=$?" >>testtrygrep +echo "---------------------------- Test 127 -----------------------------" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep +echo "RC=$?" >>testtrygrep + +echo "---------------------------- Test 128 -----------------------------" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1 +echo "RC=$?" >>testtrygrep # Now compare the results. diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 7ca39f5..84f4442 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -2252,12 +2252,12 @@ segment. PCRE2_INFO_MINLENGTH If a minimum length for matching subject strings was computed, its value is -returned. Otherwise the returned value is 0. The value is a number of -characters, which in UTF mode may be different from the number of code units. -The third argument should point to an uint32_t variable. The value is a -lower bound to the length of any matching string. There may not be any strings -of that length that do actually match, but every string that does match is at -least that long. +returned. Otherwise the returned value is 0. This value is not computed when +PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in +UTF mode may be different from the number of code units. The third argument +should point to an uint32_t variable. The value is a lower bound to the +length of any matching string. There may not be any strings of that length that +do actually match, but every string that does match is at least that long.
   PCRE2_INFO_NAMECOUNT
   PCRE2_INFO_NAMEENTRYSIZE
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index d66cee3..de699e7 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -685,20 +685,32 @@ otherwise empty line. This option is mutually exclusive with --output,
 

-onumber, --only-matching=number Show only the part of the line that matched the capturing parentheses of the -given number. Up to 32 capturing parentheses are supported, and -o0 is -equivalent to -o without a number. Because these options can be given -without an argument (see above), if an argument is present, it must be given in -the same shell item, for example, -o3 or --only-matching=2. The comments given -for the non-argument case above also apply to this option. If the specified -capturing parentheses do not exist in the pattern, or were not set in the -match, nothing is output unless the file name or line number are being output. +given number. Up to 50 capturing parentheses are supported by default. This +limit can be changed via the --om-capture option. A pattern may contain +any number of capturing parentheses, but only those whose number is within the +limit can be accessed by -o. An error occurs if the number specified by +-o is greater than the limit. +
+
+-o0 is the same as -o without a number. Because these options can be +given without an argument (see above), if an argument is present, it must be +given in the same shell item, for example, -o3 or --only-matching=2. The +comments given for the non-argument case above also apply to this option. If +the specified capturing parentheses do not exist in the pattern, or were not +set in the match, nothing is output unless the file name or line number are +being output.

If this option is given multiple times, multiple substrings are output for each match, in the order the options are given, and all on one line. For example, -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and then 3 again to be output. By default, there is no separator (but see the next -option). +but one option). +

+

+--om-capture=number +Set the number of capturing parentheses that can be accessed by -o. The +default is 50.

--om-separator=text diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html index 083d5cc..4be47c6 100644 --- a/doc/html/pcre2test.html +++ b/doc/html/pcre2test.html @@ -738,7 +738,9 @@ options, the line is omitted. "First code unit" is where any match must start; if there is more than one they are listed as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. This is not necessarily the last character. These lines are omitted if no starting or -ending code units are recorded. +ending code units are recorded. The subject length line is omitted when +no_start_optimize is set because the minimum length is not calculated +when it can never be used.

The framesize modifier shows the size, in bytes, of the storage frames diff --git a/doc/pcre2grep.1 b/doc/pcre2grep.1 index 6b3219b..1dcdb68 100644 --- a/doc/pcre2grep.1 +++ b/doc/pcre2grep.1 @@ -596,19 +596,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP, .TP \fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP Show only the part of the line that matched the capturing parentheses of the -given number. Up to 32 capturing parentheses are supported, and -o0 is -equivalent to \fB-o\fP without a number. Because these options can be given -without an argument (see above), if an argument is present, it must be given in -the same shell item, for example, -o3 or --only-matching=2. The comments given -for the non-argument case above also apply to this option. If the specified -capturing parentheses do not exist in the pattern, or were not set in the -match, nothing is output unless the file name or line number are being output. +given number. Up to 50 capturing parentheses are supported by default. This +limit can be changed via the \fB--om-capture\fP option. A pattern may contain +any number of capturing parentheses, but only those whose number is within the +limit can be accessed by \fB-o\fP. An error occurs if the number specified by +\fB-o\fP is greater than the limit. +.sp +-o0 is the same as \fB-o\fP without a number. Because these options can be +given without an argument (see above), if an argument is present, it must be +given in the same shell item, for example, -o3 or --only-matching=2. The +comments given for the non-argument case above also apply to this option. If +the specified capturing parentheses do not exist in the pattern, or were not +set in the match, nothing is output unless the file name or line number are +being output. .sp If this option is given multiple times, multiple substrings are output for each match, in the order the options are given, and all on one line. For example, -o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and then 3 again to be output. By default, there is no separator (but see the next -option). +but one option). +.TP +\fB--om-capture\fP=\fInumber\fP +Set the number of capturing parentheses that can be accessed by \fB-o\fP. The +default is 50. .TP \fB--om-separator\fP=\fItext\fP Specify a separating string for multiple occurrences of \fB-o\fP. The default diff --git a/doc/pcre2grep.txt b/doc/pcre2grep.txt index cd44fe0..2920643 100644 --- a/doc/pcre2grep.txt +++ b/doc/pcre2grep.txt @@ -662,23 +662,32 @@ OPTIONS -onumber, --only-matching=number Show only the part of the line that matched the capturing - parentheses of the given number. Up to 32 capturing parenthe- - ses are supported, and -o0 is equivalent to -o without a num- - ber. Because these options can be given without an argument - (see above), if an argument is present, it must be given in - the same shell item, for example, -o3 or --only-matching=2. - The comments given for the non-argument case above also apply - to this option. If the specified capturing parentheses do not - exist in the pattern, or were not set in the match, nothing - is output unless the file name or line number are being out- - put. - - If this option is given multiple times, multiple substrings - are output for each match, in the order the options are - given, and all on one line. For example, -o3 -o1 -o3 causes - the substrings matched by capturing parentheses 3 and 1 and - then 3 again to be output. By default, there is no separator - (but see the next option). + parentheses of the given number. Up to 50 capturing parenthe- + ses are supported by default. This limit can be changed via + the --om-capture option. A pattern may contain any number of + capturing parentheses, but only those whose number is within + the limit can be accessed by -o. An error occurs if the num- + ber specified by -o is greater than the limit. + + -o0 is the same as -o without a number. Because these options + can be given without an argument (see above), if an argument + is present, it must be given in the same shell item, for + example, -o3 or --only-matching=2. The comments given for the + non-argument case above also apply to this option. If the + specified capturing parentheses do not exist in the pattern, + or were not set in the match, nothing is output unless the + file name or line number are being output. + + If this option is given multiple times, multiple substrings + are output for each match, in the order the options are + given, and all on one line. For example, -o3 -o1 -o3 causes + the substrings matched by capturing parentheses 3 and 1 and + then 3 again to be output. By default, there is no separator + (but see the next but one option). + + --om-capture=number + Set the number of capturing parentheses that can be accessed + by -o. The default is 50. --om-separator=text Specify a separating string for multiple occurrences of -o. diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt index cbe3528..f287f6d 100644 --- a/doc/pcre2test.txt +++ b/doc/pcre2test.txt @@ -669,7 +669,9 @@ PATTERN MODIFIERS as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. This is not necessarily the last character. These lines are omitted if no starting or ending code - units are recorded. + units are recorded. The subject length line is omitted when + no_start_optimize is set because the minimum length is not calculated + when it can never be used. The framesize modifier shows the size, in bytes, of the storage frames used by pcre2_match() for handling backtracking. The size depends on diff --git a/src/pcre2grep.c b/src/pcre2grep.c index a3cc3ec..d17cd2a 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -115,7 +115,7 @@ MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */ typedef int BOOL; -#define OFFSET_SIZE 33 +#define DEFAULT_CAPTURE_MAX 50 #if BUFSIZ > 8192 #define MAXPATLEN BUFSIZ @@ -242,6 +242,8 @@ static pcre2_compile_context *compile_context; static pcre2_match_context *match_context; static pcre2_match_data *match_data; static PCRE2_SIZE *offsets; +static uint32_t offset_size; +static uint32_t capture_max = DEFAULT_CAPTURE_MAX; static BOOL count_only = FALSE; static BOOL do_colour = FALSE; @@ -391,6 +393,7 @@ used to identify them. */ #define N_INCLUDE_FROM (-21) #define N_OM_SEPARATOR (-22) #define N_MAX_BUFSIZE (-23) +#define N_OM_CAPTURE (-24) static option_item optionlist[] = { { OP_NODATA, N_NULL, NULL, "", "terminate options" }, @@ -437,6 +440,7 @@ static option_item optionlist[] = { { OP_STRING, 'O', &output_text, "output=text", "show only this text (possibly expanded)" }, { OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" }, { OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" }, + { OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" }, { OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" }, { OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" }, { OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" }, @@ -2568,7 +2572,7 @@ while (ptr < endptr) for (i = 0; i < jfriedl_XR; i++) match = (pcre_exec(patterns->compiled, patterns->hint, ptr, length, 0, - PCRE2_NOTEMPTY, offsets, OFFSET_SIZE) >= 0); + PCRE2_NOTEMPTY, offsets, offset_size) >= 0); if (gettimeofday(&end_time, &dummy) != 0) perror("bad gettimeofday"); @@ -2688,7 +2692,7 @@ while (ptr < endptr) for (om = only_matching; om != NULL; om = om->next) { int n = om->groupnum; - if (n < mrc) + if (n == 0 || n < mrc) { int plen = offsets[2*n + 1] - offsets[2*n]; if (plen > 0) @@ -3639,6 +3643,7 @@ int rc = 1; BOOL only_one_at_top; patstr *cp; fnstr *fn; +omstr *om; const char *locale_from = "--locale"; #ifdef SUPPORT_PCRE2GREP_JIT @@ -3655,20 +3660,6 @@ must use STDOUT_NL to terminate lines. */ _setmode(_fileno(stdout), _O_BINARY); #endif -/* Set up a default compile and match contexts and a match data block. */ - -compile_context = pcre2_compile_context_create(NULL); -match_context = pcre2_match_context_create(NULL); -match_data = pcre2_match_data_create(OFFSET_SIZE, NULL); -offsets = pcre2_get_ovector_pointer(match_data); - -/* If string (script) callouts are supported, set up the callout processing -function. */ - -#ifdef SUPPORT_PCRE2GREP_CALLOUT -pcre2_set_callout(match_context, pcre2grep_callout, NULL); -#endif - /* Process the options */ for (i = 1; i < argc; i++) @@ -4015,12 +4006,40 @@ if (only_matching_count > 1) pcre2grep_exit(usage(2)); } +/* Check that there is a big enough ovector for all -o settings. */ + +for (om = only_matching; om != NULL; om = om->next) + { + int n = om->groupnum; + if (n > (int)capture_max) + { + fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n); + fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n"); + goto EXIT2; + } + } + /* Check the text supplied to --output for errors. */ if (output_text != NULL && !syntax_check_output_text((PCRE2_SPTR)output_text, FALSE)) goto EXIT2; +/* Set up default compile and match contexts and a match data block. */ + +offset_size = capture_max + 1; +compile_context = pcre2_compile_context_create(NULL); +match_context = pcre2_match_context_create(NULL); +match_data = pcre2_match_data_create(offset_size, NULL); +offsets = pcre2_get_ovector_pointer(match_data); + +/* If string (script) callouts are supported, set up the callout processing +function. */ + +#ifdef SUPPORT_PCRE2GREP_CALLOUT +pcre2_set_callout(match_context, pcre2grep_callout, NULL); +#endif + /* Put limits into the match data block. */ if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit); diff --git a/testdata/grepoutput b/testdata/grepoutput index 2bd69be..a9297e1 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -949,3 +949,10 @@ RC=0 ---------------------------- Test 126 ----------------------------- ABCXYZ RC=0 +---------------------------- Test 127 ----------------------------- +pattern +RC=0 +---------------------------- Test 128 ----------------------------- +pcre2grep: Requested group 1 cannot be captured. +pcre2grep: Use --om-capture to increase the size of the capture vector. +RC=2 -- 2.20.1