From 23be49cbbfe0608c3ced9345e8e1d64bf9576c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaroslav=20=C5=A0karvada?= Date: Fri, 14 Nov 2014 17:28:18 +0100 Subject: [PATCH] Backported more PCRE fixes (by pcre-backported-fixes patch) - Dropped pcre-invalid-utf8-fix patch, handled by pcre-backported-fixes patch --- grep-2.20-pcre-backported-fixes.patch | 389 ++++++++++++++++++++++++++ grep-2.20-pcre-invalid-utf8-fix.patch | 136 --------- grep.spec | 10 +- 3 files changed, 396 insertions(+), 139 deletions(-) create mode 100644 grep-2.20-pcre-backported-fixes.patch delete mode 100644 grep-2.20-pcre-invalid-utf8-fix.patch diff --git a/grep-2.20-pcre-backported-fixes.patch b/grep-2.20-pcre-backported-fixes.patch new file mode 100644 index 0000000..4a9dbcd --- /dev/null +++ b/grep-2.20-pcre-backported-fixes.patch @@ -0,0 +1,389 @@ +diff --git a/src/grep.h b/src/grep.h +index 4935872..729c906 100644 +--- a/src/grep.h ++++ b/src/grep.h +@@ -27,4 +27,19 @@ extern int match_words; /* -w */ + extern int match_lines; /* -x */ + extern unsigned char eolbyte; /* -z */ + ++/* An enum textbin describes the file's type, inferred from data read ++ before the first line is selected for output. */ ++enum textbin ++ { ++ /* Binary, as it contains null bytes and the -z option is not in effect, ++ or it contains encoding errors. */ ++ TEXTBIN_BINARY = -1, ++ ++ /* Not known yet. Only text has been seen so far. */ ++ TEXTBIN_UNKNOWN = 0, ++ ++ /* Text. */ ++ TEXTBIN_TEXT = 1 ++ }; ++ + #endif +diff --git a/src/pcresearch.c b/src/pcresearch.c +index 820dd00..9938ffc 100644 +--- a/src/pcresearch.c ++++ b/src/pcresearch.c +@@ -33,13 +33,19 @@ static pcre *cre; + /* Additional information about the pattern. */ + static pcre_extra *extra; + +-# ifdef PCRE_STUDY_JIT_COMPILE +-static pcre_jit_stack *jit_stack; +-# else ++# ifndef PCRE_STUDY_JIT_COMPILE + # define PCRE_STUDY_JIT_COMPILE 0 + # endif + #endif + ++/* Table, indexed by ! 
(flag & PCRE_NOTBOL), of whether the empty ++ string matches when that flag is used. */ ++static int empty_match[2]; ++ ++/* This must be at least 2; everything after that is for performance ++ in pcre_exec. */ ++enum { NSUB = 300 }; ++ + void + Pcompile (char const *pattern, size_t size) + { +@@ -52,13 +58,17 @@ Pcompile (char const *pattern, size_t size) + char const *ep; + char *re = xnmalloc (4, size + 7); + int flags = (PCRE_MULTILINE +- | (match_icase ? PCRE_CASELESS : 0) +- | (using_utf8 () ? PCRE_UTF8 : 0)); ++ | (match_icase ? PCRE_CASELESS : 0)); + char const *patlim = pattern + size; + char *n = re; + char const *p; + char const *pnul; + ++ if (using_utf8 ()) ++ flags |= PCRE_UTF8; ++ else if (MB_CUR_MAX != 1) ++ error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); ++ + /* FIXME: Remove these restrictions. */ + if (memchr (pattern, '\n', size)) + error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); +@@ -114,14 +124,20 @@ Pcompile (char const *pattern, size_t size) + /* A 32K stack is allocated for the machine code by default, which + can grow to 512K if necessary. Since JIT uses far less memory + than the interpreter, this should be enough in practice. 
*/ +- jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); ++ pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024); + if (!jit_stack) + error (EXIT_TROUBLE, 0, + _("failed to allocate memory for the PCRE JIT stack")); + pcre_assign_jit_stack (extra, NULL, jit_stack); + } ++ + # endif + free (re); ++ ++ int sub[NSUB]; ++ empty_match[false] = pcre_exec (cre, extra, "", 0, 0, ++ PCRE_NOTBOL, sub, NSUB); ++ empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB); + #endif /* HAVE_LIBPCRE */ + } + +@@ -134,36 +150,110 @@ Pexecute (char const *buf, size_t size, size_t *match_size, + error (EXIT_TROUBLE, 0, _("internal error")); + return -1; + #else +- /* This array must have at least two elements; everything after that +- is just for performance improvement in pcre_exec. */ +- int sub[300]; +- +- const char *line_buf, *line_end, *line_next; ++ int sub[NSUB]; ++ char const *p = start_ptr ? start_ptr : buf; ++ bool bol = p[-1] == eolbyte; ++ char const *line_start = buf; + int e = PCRE_ERROR_NOMATCH; +- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; ++ char const *line_end; + +- /* PCRE can't limit the matching to single lines, therefore we have to +- match each line in the buffer separately. */ +- for (line_next = buf; +- e == PCRE_ERROR_NOMATCH && line_next < buf + size; +- start_ofs -= line_next - line_buf) ++ /* If the input type is unknown, the caller is still testing the ++ input, which means the current buffer cannot contain encoding ++ errors and a multiline search is typically more efficient. ++ Otherwise, a single-line search is typically faster, so that ++ pcre_exec doesn't waste time validating the entire input ++ buffer. 
*/ ++ bool multiline = TEXTBIN_UNKNOWN; ++ ++ for (; p < buf + size; p = line_start = line_end + 1) + { +- line_buf = line_next; +- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); +- if (line_end == NULL) +- line_next = line_end = buf + size; +- else +- line_next = line_end + 1; ++ bool too_big; + +- if (start_ptr && start_ptr >= line_end) +- continue; ++ if (multiline) ++ { ++ size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1); ++ size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); ++ line_end = memrchr (p, eolbyte, scan_size); ++ too_big = ! line_end; ++ } ++ else ++ { ++ line_end = memchr (p, eolbyte, buf + size - p); ++ too_big = INT_MAX < line_end - p; ++ } + +- if (INT_MAX < line_end - line_buf) ++ if (too_big) + error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); + +- e = pcre_exec (cre, extra, line_buf, line_end - line_buf, +- start_ofs < 0 ? 0 : start_ofs, 0, +- sub, sizeof sub / sizeof *sub); ++ for (;;) ++ { ++ /* Skip past bytes that are easily determined to be encoding ++ errors, treating them as data that cannot match. This is ++ faster than having pcre_exec check them. */ ++ while (mbclen_cache[to_uchar (*p)] == (size_t) -1) ++ { ++ p++; ++ bol = false; ++ } ++ ++ /* Check for an empty match; this is faster than letting ++ pcre_exec do it. */ ++ int search_bytes = line_end - p; ++ if (search_bytes == 0) ++ { ++ sub[0] = sub[1] = 0; ++ e = empty_match[bol]; ++ break; ++ } ++ ++ int options = 0; ++ if (!bol) ++ options |= PCRE_NOTBOL; ++ if (multiline) ++ options |= PCRE_NO_UTF8_CHECK; ++ ++ e = pcre_exec (cre, extra, p, search_bytes, 0, ++ options, sub, NSUB); ++ if (e != PCRE_ERROR_BADUTF8) ++ { ++ if (0 < e && multiline && sub[1] - sub[0] != 0) ++ { ++ char const *nl = memchr (p + sub[0], eolbyte, ++ sub[1] - sub[0]); ++ if (nl) ++ { ++ /* This match crosses a line boundary; reject it. 
*/ ++ p += sub[0]; ++ line_end = nl; ++ continue; ++ } ++ } ++ break; ++ } ++ int valid_bytes = sub[0]; ++ ++ /* Try to match the string before the encoding error. ++ Again, handle the empty-match case specially, for speed. */ ++ if (valid_bytes == 0) ++ { ++ sub[1] = 0; ++ e = empty_match[bol]; ++ } ++ else ++ e = pcre_exec (cre, extra, p, valid_bytes, 0, ++ options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, ++ sub, NSUB); ++ if (e != PCRE_ERROR_NOMATCH) ++ break; ++ ++ /* Treat the encoding error as data that cannot match. */ ++ p += valid_bytes + 1; ++ bol = false; ++ } ++ ++ if (e != PCRE_ERROR_NOMATCH) ++ break; ++ bol = true; + } + + if (e <= 0) +@@ -171,7 +261,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size, + switch (e) + { + case PCRE_ERROR_NOMATCH: +- return -1; ++ break; + + case PCRE_ERROR_NOMEMORY: + error (EXIT_TROUBLE, 0, _("memory exhausted")); +@@ -180,10 +270,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, + error (EXIT_TROUBLE, 0, + _("exceeded PCRE's backtracking limit")); + +- case PCRE_ERROR_BADUTF8: +- error (EXIT_TROUBLE, 0, +- _("invalid UTF-8 byte sequence in input")); +- + default: + /* For now, we lump all remaining PCRE failures into this basket. + If anyone cares to provide sample grep usage that can trigger +@@ -192,30 +278,33 @@ Pexecute (char const *buf, size_t size, size_t *match_size, + error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e); + } + +- /* NOTREACHED */ + return -1; + } + else + { +- /* Narrow down to the line we've found. 
*/ +- char const *beg = line_buf + sub[0]; +- char const *end = line_buf + sub[1]; +- char const *buflim = buf + size; +- char eol = eolbyte; +- if (!start_ptr) ++ char const *matchbeg = p + sub[0]; ++ char const *matchend = p + sub[1]; ++ char const *beg; ++ char const *end; ++ if (start_ptr) + { +- /* FIXME: The case when '\n' is not found indicates a bug: +- Since grep is line oriented, the match should never contain +- a newline, so there _must_ be a newline following. +- */ +- if (!(end = memchr (end, eol, buflim - end))) +- end = buflim; +- else +- end++; +- while (buf < beg && beg[-1] != eol) +- --beg; ++ beg = matchbeg; ++ end = matchend; ++ } ++ else if (multiline) ++ { ++ char const *prev_nl = memrchr (line_start - 1, eolbyte, ++ matchbeg - (line_start - 1)); ++ char const *next_nl = memchr (matchend, eolbyte, ++ line_end + 1 - matchend); ++ beg = prev_nl + 1; ++ end = next_nl + 1; ++ } ++ else ++ { ++ beg = line_start; ++ end = line_end + 1; + } +- + *match_size = end - beg; + return beg - buf; + } +diff --git a/src/search.h b/src/search.h +index 14877bc..e671bea 100644 +--- a/src/search.h ++++ b/src/search.h +@@ -45,6 +45,7 @@ extern void kwsinit (kwset_t *); + + extern char *mbtoupper (char const *, size_t *, mb_len_map_t **); + extern void build_mbclen_cache (void); ++extern size_t mbclen_cache[]; + extern ptrdiff_t mb_goback (char const **, char const *, char const *); + extern wint_t mb_prev_wc (char const *, char const *, char const *); + extern wint_t mb_next_wc (char const *, char const *); +diff --git a/src/searchutils.c b/src/searchutils.c +index 5eb9a12..aba9335 100644 +--- a/src/searchutils.c ++++ b/src/searchutils.c +@@ -22,7 +22,7 @@ + + #define NCHAR (UCHAR_MAX + 1) + +-static size_t mbclen_cache[NCHAR]; ++size_t mbclen_cache[NCHAR]; + + void + kwsinit (kwset_t *kwset) +diff --git a/tests/pcre-infloop b/tests/pcre-infloop +index 1b33e72..8054844 100755 +--- a/tests/pcre-infloop ++++ b/tests/pcre-infloop +@@ -18,16 +18,16 @@ + # along with 
this program. If not, see <http://www.gnu.org/licenses/>. + + . "${srcdir=.}/init.sh"; path_prepend_ ../src +-require_pcre_ + require_timeout_ + require_en_utf8_locale_ + require_compiled_in_MB_support ++LC_ALL=en_US.UTF-8 require_pcre_ + + printf 'a\201b\r' > in || framework_failure_ + + fail=0 + + LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in +-test $? = 2 || fail_ "libpcre's match function appears to infloop" ++test $? = 1 || fail_ "libpcre's match function appears to infloop" + + Exit $fail +diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input +index 913e8ee..abcc7e8 100755 +--- a/tests/pcre-invalid-utf8-input ++++ b/tests/pcre-invalid-utf8-input +@@ -8,14 +8,19 @@ + # notice and this notice are preserved. + + . "${srcdir=.}/init.sh"; path_prepend_ ../src +-require_pcre_ ++require_timeout_ + require_en_utf8_locale_ ++require_compiled_in_MB_support ++LC_ALL=en_US.UTF-8 require_pcre_ + + fail=0 + +-printf 'j\202\nj\n' > in || framework_failure_ ++printf 'j\202j\nj\nk\202\n' > in || framework_failure_ + +-LC_ALL=en_US.UTF-8 grep -P j in +-test $? -eq 2 || fail=1 ++LC_ALL=en_US.UTF-8 timeout 3 grep -P j in ++test $? -eq 0 || fail=1 ++ ++LC_ALL=en_US.UTF-8 timeout 3 grep -P 'k$' in ++test $? -eq 1 || fail=1 + + Exit $fail +diff --git a/tests/pcre-utf8 b/tests/pcre-utf8 +index 41676f4..2dda116 100755 +--- a/tests/pcre-utf8 ++++ b/tests/pcre-utf8 +@@ -8,8 +8,8 @@ + # notice and this notice are preserved. + + . 
"${srcdir=.}/init.sh"; path_prepend_ ../src +-require_pcre_ + require_en_utf8_locale_ ++LC_ALL=en_US.UTF-8 require_pcre_ + + fail=0 + diff --git a/grep-2.20-pcre-invalid-utf8-fix.patch b/grep-2.20-pcre-invalid-utf8-fix.patch deleted file mode 100644 index 5f7530f..0000000 --- a/grep-2.20-pcre-invalid-utf8-fix.patch +++ /dev/null @@ -1,136 +0,0 @@ -diff --git a/src/pcresearch.c b/src/pcresearch.c -index 820dd00..11df488 100644 ---- a/src/pcresearch.c -+++ b/src/pcresearch.c -@@ -136,34 +136,42 @@ Pexecute (char const *buf, size_t size, size_t *match_size, - #else - /* This array must have at least two elements; everything after that - is just for performance improvement in pcre_exec. */ -- int sub[300]; -+ enum { nsub = 300 }; -+ int sub[nsub]; - -- const char *line_buf, *line_end, *line_next; -+ char const *p = start_ptr ? start_ptr : buf; -+ int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL; -+ char const *line_start = buf; - int e = PCRE_ERROR_NOMATCH; -- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; -+ char const *line_end; - - /* PCRE can't limit the matching to single lines, therefore we have to - match each line in the buffer separately. */ -- for (line_next = buf; -- e == PCRE_ERROR_NOMATCH && line_next < buf + size; -- start_ofs -= line_next - line_buf) -+ for (; p < buf + size; p = line_start = line_end + 1) - { -- line_buf = line_next; -- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); -- if (line_end == NULL) -- line_next = line_end = buf + size; -- else -- line_next = line_end + 1; -- -- if (start_ptr && start_ptr >= line_end) -- continue; -+ line_end = memchr (p, eolbyte, buf + size - p); - -- if (INT_MAX < line_end - line_buf) -+ if (INT_MAX < line_end - p) - error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); - -- e = pcre_exec (cre, extra, line_buf, line_end - line_buf, -- start_ofs < 0 ? 0 : start_ofs, 0, -- sub, sizeof sub / sizeof *sub); -+ /* Treat encoding-error bytes as data that cannot match. 
*/ -+ for (;;) -+ { -+ e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub); -+ if (e != PCRE_ERROR_BADUTF8) -+ break; -+ e = pcre_exec (cre, extra, p, sub[0], 0, -+ options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, -+ sub, nsub); -+ if (e != PCRE_ERROR_NOMATCH) -+ break; -+ p += sub[0] + 1; -+ options = PCRE_NOTBOL; -+ } -+ -+ if (e != PCRE_ERROR_NOMATCH) -+ break; -+ options = 0; - } - - if (e <= 0) -@@ -180,10 +188,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, - error (EXIT_TROUBLE, 0, - _("exceeded PCRE's backtracking limit")); - -- case PCRE_ERROR_BADUTF8: -- error (EXIT_TROUBLE, 0, -- _("invalid UTF-8 byte sequence in input")); -- - default: - /* For now, we lump all remaining PCRE failures into this basket. - If anyone cares to provide sample grep usage that can trigger -@@ -197,25 +201,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size, - } - else - { -- /* Narrow down to the line we've found. */ -- char const *beg = line_buf + sub[0]; -- char const *end = line_buf + sub[1]; -- char const *buflim = buf + size; -- char eol = eolbyte; -- if (!start_ptr) -- { -- /* FIXME: The case when '\n' is not found indicates a bug: -- Since grep is line oriented, the match should never contain -- a newline, so there _must_ be a newline following. -- */ -- if (!(end = memchr (end, eol, buflim - end))) -- end = buflim; -- else -- end++; -- while (buf < beg && beg[-1] != eol) -- --beg; -- } -- -+ char const *beg = start_ptr ? p + sub[0] : line_start; -+ char const *end = start_ptr ? p + sub[1] : line_end + 1; - *match_size = end - beg; - return beg - buf; - } -diff --git a/tests/pcre-infloop b/tests/pcre-infloop -index 1b33e72..b92f8e1 100755 ---- a/tests/pcre-infloop -+++ b/tests/pcre-infloop -@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ - fail=0 - - LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in --test $? = 2 || fail_ "libpcre's match function appears to infloop" -+test $? 
= 1 || fail_ "libpcre's match function appears to infloop" - - Exit $fail -diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input -index 913e8ee..9da4b18 100755 ---- a/tests/pcre-invalid-utf8-input -+++ b/tests/pcre-invalid-utf8-input -@@ -13,9 +13,12 @@ require_en_utf8_locale_ - - fail=0 - --printf 'j\202\nj\n' > in || framework_failure_ -+printf 'j\202j\nj\nk\202\n' > in || framework_failure_ - - LC_ALL=en_US.UTF-8 grep -P j in --test $? -eq 2 || fail=1 -+test $? -eq 0 || fail=1 -+ -+LC_ALL=en_US.UTF-8 grep -P 'k$' in -+test $? -eq 1 || fail=1 - - Exit $fail diff --git a/grep.spec b/grep.spec index 1784194..f76f13b 100644 --- a/grep.spec +++ b/grep.spec @@ -3,7 +3,7 @@ Summary: Pattern matching utilities Name: grep Version: 2.20 -Release: 6%{?dist} +Release: 7%{?dist} License: GPLv3+ Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz @@ -16,7 +16,7 @@ Patch0: grep-2.20-man-fix-gs.patch # upstream ticket 39445 Patch1: grep-2.20-help-align.patch # backported from upstream -Patch2: grep-2.20-pcre-invalid-utf8-fix.patch +Patch2: grep-2.20-pcre-backported-fixes.patch URL: http://www.gnu.org/software/grep/ Requires(post): /sbin/install-info Requires(preun): /sbin/install-info @@ -37,7 +37,7 @@ GNU grep is needed by many scripts, so it shall be installed on every system. %setup -q %patch0 -p1 -b .man-fix-gs %patch1 -p1 -b .help-align -%patch2 -p1 -b .pcre-invalid-utf8-fix +%patch2 -p1 -b .pcre-backported-fixes %build %global BUILD_FLAGS $RPM_OPT_FLAGS @@ -93,6 +93,10 @@ fi %{_libexecdir}/grepconf.sh %changelog +* Fri Nov 14 2014 Jaroslav Škarvada - 2.20-7 +- Backported more PCRE fixes (by pcre-backported-fixes patch) +- Dropped pcre-invalid-utf8-fix patch, handled by pcre-backported-fixes patch + * Tue Nov 11 2014 Jaroslav Škarvada - 2.20-6 - Fixed invalid UTF-8 byte sequence error in PCRE mode (by pcre-invalid-utf8-fix patch)