From 23be49cbbfe0608c3ced9345e8e1d64bf9576c04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaroslav=20=C5=A0karvada?= <jskarvad@redhat.com>
Date: Fri, 14 Nov 2014 17:28:18 +0100
Subject: [PATCH] Backported more PCRE fixes (by pcre-backported-fixes patch)

- Dropped pcre-invalid-utf8-fix patch, handled by pcre-backported-fixes patch
---
 grep-2.20-pcre-backported-fixes.patch | 389 ++++++++++++++++++++++++++
 grep-2.20-pcre-invalid-utf8-fix.patch | 136 ---------
 grep.spec                             |  10 +-
 3 files changed, 396 insertions(+), 139 deletions(-)
 create mode 100644 grep-2.20-pcre-backported-fixes.patch
 delete mode 100644 grep-2.20-pcre-invalid-utf8-fix.patch

diff --git a/grep-2.20-pcre-backported-fixes.patch b/grep-2.20-pcre-backported-fixes.patch
new file mode 100644
index 0000000..4a9dbcd
--- /dev/null
+++ b/grep-2.20-pcre-backported-fixes.patch
@@ -0,0 +1,389 @@
+diff --git a/src/grep.h b/src/grep.h
+index 4935872..729c906 100644
+--- a/src/grep.h
++++ b/src/grep.h
+@@ -27,4 +27,19 @@ extern int match_words;		/* -w */
+ extern int match_lines;		/* -x */
+ extern unsigned char eolbyte;	/* -z */
+ 
++/* An enum textbin describes the file's type, inferred from data read
++   before the first line is selected for output.  */
++enum textbin
++  {
++    /* Binary, as it contains null bytes and the -z option is not in effect,
++       or it contains encoding errors.  */
++    TEXTBIN_BINARY = -1,
++
++    /* Not known yet.  Only text has been seen so far.  */
++    TEXTBIN_UNKNOWN = 0,
++
++    /* Text.  */
++    TEXTBIN_TEXT = 1
++  };
++
+ #endif
+diff --git a/src/pcresearch.c b/src/pcresearch.c
+index 820dd00..9938ffc 100644
+--- a/src/pcresearch.c
++++ b/src/pcresearch.c
+@@ -33,13 +33,19 @@ static pcre *cre;
+ /* Additional information about the pattern.  */
+ static pcre_extra *extra;
+ 
+-# ifdef PCRE_STUDY_JIT_COMPILE
+-static pcre_jit_stack *jit_stack;
+-# else
++# ifndef PCRE_STUDY_JIT_COMPILE
+ #  define PCRE_STUDY_JIT_COMPILE 0
+ # endif
+ #endif
+ 
++/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
++   string matches when that flag is used.  */
++static int empty_match[2];
++
++/* This must be at least 2; everything after that is for performance
++   in pcre_exec.  */
++enum { NSUB = 300 };
++
+ void
+ Pcompile (char const *pattern, size_t size)
+ {
+@@ -52,13 +58,17 @@ Pcompile (char const *pattern, size_t size)
+   char const *ep;
+   char *re = xnmalloc (4, size + 7);
+   int flags = (PCRE_MULTILINE
+-               | (match_icase ? PCRE_CASELESS : 0)
+-               | (using_utf8 () ? PCRE_UTF8 : 0));
++               | (match_icase ? PCRE_CASELESS : 0));
+   char const *patlim = pattern + size;
+   char *n = re;
+   char const *p;
+   char const *pnul;
+ 
++  if (using_utf8 ())
++    flags |= PCRE_UTF8;
++  else if (MB_CUR_MAX != 1)
++    error (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
++
+   /* FIXME: Remove these restrictions.  */
+   if (memchr (pattern, '\n', size))
+     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+@@ -114,14 +124,20 @@ Pcompile (char const *pattern, size_t size)
+       /* A 32K stack is allocated for the machine code by default, which
+          can grow to 512K if necessary. Since JIT uses far less memory
+          than the interpreter, this should be enough in practice.  */
+-      jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
++      pcre_jit_stack *jit_stack = pcre_jit_stack_alloc (32 * 1024, 512 * 1024);
+       if (!jit_stack)
+         error (EXIT_TROUBLE, 0,
+                _("failed to allocate memory for the PCRE JIT stack"));
+       pcre_assign_jit_stack (extra, NULL, jit_stack);
+     }
++
+ # endif
+   free (re);
++
++  int sub[NSUB];
++  empty_match[false] = pcre_exec (cre, extra, "", 0, 0,
++                                  PCRE_NOTBOL, sub, NSUB);
++  empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, sub, NSUB);
+ #endif /* HAVE_LIBPCRE */
+ }
+ 
+@@ -134,36 +150,110 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+   error (EXIT_TROUBLE, 0, _("internal error"));
+   return -1;
+ #else
+-  /* This array must have at least two elements; everything after that
+-     is just for performance improvement in pcre_exec.  */
+-  int sub[300];
+-
+-  const char *line_buf, *line_end, *line_next;
++  int sub[NSUB];
++  char const *p = start_ptr ? start_ptr : buf;
++  bool bol = p[-1] == eolbyte;
++  char const *line_start = buf;
+   int e = PCRE_ERROR_NOMATCH;
+-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
++  char const *line_end;
+ 
+-  /* PCRE can't limit the matching to single lines, therefore we have to
+-     match each line in the buffer separately.  */
+-  for (line_next = buf;
+-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
+-       start_ofs -= line_next - line_buf)
+  /* A multiline search is typically more efficient while the input
+     type is unknown, as the buffer then cannot contain encoding
+     errors; otherwise a single-line search is typically faster, so
+     that pcre_exec doesn't waste time validating the entire buffer.
+     NOTE: the initializer below is the constant TEXTBIN_UNKNOWN (0),
+     so MULTILINE is always false and the search is single-line.  */
++  bool multiline = TEXTBIN_UNKNOWN;
++
++  for (; p < buf + size; p = line_start = line_end + 1)
+     {
+-      line_buf = line_next;
+-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
+-      if (line_end == NULL)
+-        line_next = line_end = buf + size;
+-      else
+-        line_next = line_end + 1;
++      bool too_big;
+ 
+-      if (start_ptr && start_ptr >= line_end)
+-        continue;
++      if (multiline)
++        {
++          size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
++          size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
++          line_end = memrchr (p, eolbyte, scan_size);
++          too_big = ! line_end;
++        }
++      else
++        {
++          line_end = memchr (p, eolbyte, buf + size - p);
++          too_big = INT_MAX < line_end - p;
++        }
+ 
+-      if (INT_MAX < line_end - line_buf)
++      if (too_big)
+         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
+ 
+-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
+-                     start_ofs < 0 ? 0 : start_ofs, 0,
+-                     sub, sizeof sub / sizeof *sub);
++      for (;;)
++        {
++          /* Skip past bytes that are easily determined to be encoding
++             errors, treating them as data that cannot match.  This is
++             faster than having pcre_exec check them.  */
++          while (mbclen_cache[to_uchar (*p)] == (size_t) -1)
++            {
++              p++;
++              bol = false;
++            }
++
++          /* Check for an empty match; this is faster than letting
++             pcre_exec do it.  */
++          int search_bytes = line_end - p;
++          if (search_bytes == 0)
++            {
++              sub[0] = sub[1] = 0;
++              e = empty_match[bol];
++              break;
++            }
++
++          int options = 0;
++          if (!bol)
++            options |= PCRE_NOTBOL;
++          if (multiline)
++            options |= PCRE_NO_UTF8_CHECK;
++
++          e = pcre_exec (cre, extra, p, search_bytes, 0,
++                         options, sub, NSUB);
++          if (e != PCRE_ERROR_BADUTF8)
++            {
++              if (0 < e && multiline && sub[1] - sub[0] != 0)
++                {
++                  char const *nl = memchr (p + sub[0], eolbyte,
++                                           sub[1] - sub[0]);
++                  if (nl)
++                    {
++                      /* This match crosses a line boundary; reject it.  */
++                      p += sub[0];
++                      line_end = nl;
++                      continue;
++                    }
++                }
++              break;
++            }
++          int valid_bytes = sub[0];
++
++          /* Try to match the string before the encoding error.
++             Again, handle the empty-match case specially, for speed.  */
++          if (valid_bytes == 0)
++            {
++              sub[1] = 0;
++              e = empty_match[bol];
++            }
++          else
++            e = pcre_exec (cre, extra, p, valid_bytes, 0,
++                           options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
++                           sub, NSUB);
++          if (e != PCRE_ERROR_NOMATCH)
++            break;
++
++          /* Treat the encoding error as data that cannot match.  */
++          p += valid_bytes + 1;
++          bol = false;
++        }
++
++      if (e != PCRE_ERROR_NOMATCH)
++        break;
++      bol = true;
+     }
+ 
+   if (e <= 0)
+@@ -171,7 +261,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+       switch (e)
+         {
+         case PCRE_ERROR_NOMATCH:
+-          return -1;
++          break;
+ 
+         case PCRE_ERROR_NOMEMORY:
+           error (EXIT_TROUBLE, 0, _("memory exhausted"));
+@@ -180,10 +270,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+           error (EXIT_TROUBLE, 0,
+                  _("exceeded PCRE's backtracking limit"));
+ 
+-        case PCRE_ERROR_BADUTF8:
+-          error (EXIT_TROUBLE, 0,
+-                 _("invalid UTF-8 byte sequence in input"));
+-
+         default:
+           /* For now, we lump all remaining PCRE failures into this basket.
+              If anyone cares to provide sample grep usage that can trigger
+@@ -192,30 +278,33 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
+           error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
+         }
+ 
+-      /* NOTREACHED */
+       return -1;
+     }
+   else
+     {
+-      /* Narrow down to the line we've found.  */
+-      char const *beg = line_buf + sub[0];
+-      char const *end = line_buf + sub[1];
+-      char const *buflim = buf + size;
+-      char eol = eolbyte;
+-      if (!start_ptr)
++      char const *matchbeg = p + sub[0];
++      char const *matchend = p + sub[1];
++      char const *beg;
++      char const *end;
++      if (start_ptr)
+         {
+-          /* FIXME: The case when '\n' is not found indicates a bug:
+-             Since grep is line oriented, the match should never contain
+-             a newline, so there _must_ be a newline following.
+-           */
+-          if (!(end = memchr (end, eol, buflim - end)))
+-            end = buflim;
+-          else
+-            end++;
+-          while (buf < beg && beg[-1] != eol)
+-            --beg;
++          beg = matchbeg;
++          end = matchend;
++        }
++      else if (multiline)
++        {
++          char const *prev_nl = memrchr (line_start - 1, eolbyte,
++                                         matchbeg - (line_start - 1));
++          char const *next_nl = memchr (matchend, eolbyte,
++                                        line_end + 1 - matchend);
++          beg = prev_nl + 1;
++          end = next_nl + 1;
++        }
++      else
++        {
++          beg = line_start;
++          end = line_end + 1;
+         }
+-
+       *match_size = end - beg;
+       return beg - buf;
+     }
+diff --git a/src/search.h b/src/search.h
+index 14877bc..e671bea 100644
+--- a/src/search.h
++++ b/src/search.h
+@@ -45,6 +45,7 @@ extern void kwsinit (kwset_t *);
+ 
+ extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
+ extern void build_mbclen_cache (void);
++extern size_t mbclen_cache[];
+ extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+ extern wint_t mb_prev_wc (char const *, char const *, char const *);
+ extern wint_t mb_next_wc (char const *, char const *);
+diff --git a/src/searchutils.c b/src/searchutils.c
+index 5eb9a12..aba9335 100644
+--- a/src/searchutils.c
++++ b/src/searchutils.c
+@@ -22,7 +22,7 @@
+ 
+ #define NCHAR (UCHAR_MAX + 1)
+ 
+-static size_t mbclen_cache[NCHAR];
++size_t mbclen_cache[NCHAR];
+ 
+ void
+ kwsinit (kwset_t *kwset)
+diff --git a/tests/pcre-infloop b/tests/pcre-infloop
+index 1b33e72..8054844 100755
+--- a/tests/pcre-infloop
++++ b/tests/pcre-infloop
+@@ -18,16 +18,16 @@
+ # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ 
+ . "${srcdir=.}/init.sh"; path_prepend_ ../src
+-require_pcre_
+ require_timeout_
+ require_en_utf8_locale_
+ require_compiled_in_MB_support
++LC_ALL=en_US.UTF-8 require_pcre_
+ 
+ printf 'a\201b\r' > in || framework_failure_
+ 
+ fail=0
+ 
+ LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
+-test $? = 2 || fail_ "libpcre's match function appears to infloop"
++test $? = 1 || fail_ "libpcre's match function appears to infloop"
+ 
+ Exit $fail
+diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
+index 913e8ee..abcc7e8 100755
+--- a/tests/pcre-invalid-utf8-input
++++ b/tests/pcre-invalid-utf8-input
+@@ -8,14 +8,19 @@
+ # notice and this notice are preserved.
+ 
+ . "${srcdir=.}/init.sh"; path_prepend_ ../src
+-require_pcre_
++require_timeout_
+ require_en_utf8_locale_
++require_compiled_in_MB_support
++LC_ALL=en_US.UTF-8 require_pcre_
+ 
+ fail=0
+ 
+-printf 'j\202\nj\n' > in || framework_failure_
++printf 'j\202j\nj\nk\202\n' > in || framework_failure_
+ 
+-LC_ALL=en_US.UTF-8 grep -P j in
+-test $? -eq 2 || fail=1
++LC_ALL=en_US.UTF-8 timeout 3 grep -P j in
++test $? -eq 0 || fail=1
++
++LC_ALL=en_US.UTF-8 timeout 3 grep -P 'k$' in
++test $? -eq 1 || fail=1
+ 
+ Exit $fail
+diff --git a/tests/pcre-utf8 b/tests/pcre-utf8
+index 41676f4..2dda116 100755
+--- a/tests/pcre-utf8
++++ b/tests/pcre-utf8
+@@ -8,8 +8,8 @@
+ # notice and this notice are preserved.
+ 
+ . "${srcdir=.}/init.sh"; path_prepend_ ../src
+-require_pcre_
+ require_en_utf8_locale_
++LC_ALL=en_US.UTF-8 require_pcre_
+ 
+ fail=0
+ 
diff --git a/grep-2.20-pcre-invalid-utf8-fix.patch b/grep-2.20-pcre-invalid-utf8-fix.patch
deleted file mode 100644
index 5f7530f..0000000
--- a/grep-2.20-pcre-invalid-utf8-fix.patch
+++ /dev/null
@@ -1,136 +0,0 @@
-diff --git a/src/pcresearch.c b/src/pcresearch.c
-index 820dd00..11df488 100644
---- a/src/pcresearch.c
-+++ b/src/pcresearch.c
-@@ -136,34 +136,42 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
- #else
-   /* This array must have at least two elements; everything after that
-      is just for performance improvement in pcre_exec.  */
--  int sub[300];
-+  enum { nsub = 300 };
-+  int sub[nsub];
- 
--  const char *line_buf, *line_end, *line_next;
-+  char const *p = start_ptr ? start_ptr : buf;
-+  int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
-+  char const *line_start = buf;
-   int e = PCRE_ERROR_NOMATCH;
--  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
-+  char const *line_end;
- 
-   /* PCRE can't limit the matching to single lines, therefore we have to
-      match each line in the buffer separately.  */
--  for (line_next = buf;
--       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
--       start_ofs -= line_next - line_buf)
-+  for (; p < buf + size; p = line_start = line_end + 1)
-     {
--      line_buf = line_next;
--      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
--      if (line_end == NULL)
--        line_next = line_end = buf + size;
--      else
--        line_next = line_end + 1;
--
--      if (start_ptr && start_ptr >= line_end)
--        continue;
-+      line_end = memchr (p, eolbyte, buf + size - p);
- 
--      if (INT_MAX < line_end - line_buf)
-+      if (INT_MAX < line_end - p)
-         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
- 
--      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
--                     start_ofs < 0 ? 0 : start_ofs, 0,
--                     sub, sizeof sub / sizeof *sub);
-+      /* Treat encoding-error bytes as data that cannot match.  */
-+      for (;;)
-+        {
-+          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
-+          if (e != PCRE_ERROR_BADUTF8)
-+            break;
-+          e = pcre_exec (cre, extra, p, sub[0], 0,
-+                         options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
-+                         sub, nsub);
-+          if (e != PCRE_ERROR_NOMATCH)
-+            break;
-+          p += sub[0] + 1;
-+          options = PCRE_NOTBOL;
-+        }
-+
-+      if (e != PCRE_ERROR_NOMATCH)
-+        break;
-+      options = 0;
-     }
- 
-   if (e <= 0)
-@@ -180,10 +188,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
-           error (EXIT_TROUBLE, 0,
-                  _("exceeded PCRE's backtracking limit"));
- 
--        case PCRE_ERROR_BADUTF8:
--          error (EXIT_TROUBLE, 0,
--                 _("invalid UTF-8 byte sequence in input"));
--
-         default:
-           /* For now, we lump all remaining PCRE failures into this basket.
-              If anyone cares to provide sample grep usage that can trigger
-@@ -197,25 +201,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
-     }
-   else
-     {
--      /* Narrow down to the line we've found.  */
--      char const *beg = line_buf + sub[0];
--      char const *end = line_buf + sub[1];
--      char const *buflim = buf + size;
--      char eol = eolbyte;
--      if (!start_ptr)
--        {
--          /* FIXME: The case when '\n' is not found indicates a bug:
--             Since grep is line oriented, the match should never contain
--             a newline, so there _must_ be a newline following.
--           */
--          if (!(end = memchr (end, eol, buflim - end)))
--            end = buflim;
--          else
--            end++;
--          while (buf < beg && beg[-1] != eol)
--            --beg;
--        }
--
-+      char const *beg = start_ptr ? p + sub[0] : line_start;
-+      char const *end = start_ptr ? p + sub[1] : line_end + 1;
-       *match_size = end - beg;
-       return beg - buf;
-     }
-diff --git a/tests/pcre-infloop b/tests/pcre-infloop
-index 1b33e72..b92f8e1 100755
---- a/tests/pcre-infloop
-+++ b/tests/pcre-infloop
-@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
- fail=0
- 
- LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
--test $? = 2 || fail_ "libpcre's match function appears to infloop"
-+test $? = 1 || fail_ "libpcre's match function appears to infloop"
- 
- Exit $fail
-diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
-index 913e8ee..9da4b18 100755
---- a/tests/pcre-invalid-utf8-input
-+++ b/tests/pcre-invalid-utf8-input
-@@ -13,9 +13,12 @@ require_en_utf8_locale_
- 
- fail=0
- 
--printf 'j\202\nj\n' > in || framework_failure_
-+printf 'j\202j\nj\nk\202\n' > in || framework_failure_
- 
- LC_ALL=en_US.UTF-8 grep -P j in
--test $? -eq 2 || fail=1
-+test $? -eq 0 || fail=1
-+
-+LC_ALL=en_US.UTF-8 grep -P 'k$' in
-+test $? -eq 1 || fail=1
- 
- Exit $fail
diff --git a/grep.spec b/grep.spec
index 1784194..f76f13b 100644
--- a/grep.spec
+++ b/grep.spec
@@ -3,7 +3,7 @@
 Summary: Pattern matching utilities
 Name: grep
 Version: 2.20
-Release: 6%{?dist}
+Release: 7%{?dist}
 License: GPLv3+
 Group: Applications/Text
 Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
@@ -16,7 +16,7 @@ Patch0: grep-2.20-man-fix-gs.patch
 # upstream ticket 39445
 Patch1: grep-2.20-help-align.patch
 # backported from upstream
-Patch2: grep-2.20-pcre-invalid-utf8-fix.patch
+Patch2: grep-2.20-pcre-backported-fixes.patch
 URL: http://www.gnu.org/software/grep/
 Requires(post): /sbin/install-info
 Requires(preun): /sbin/install-info
@@ -37,7 +37,7 @@ GNU grep is needed by many scripts, so it shall be installed on every system.
 %setup -q
 %patch0 -p1 -b .man-fix-gs
 %patch1 -p1 -b .help-align
-%patch2 -p1 -b .pcre-invalid-utf8-fix
+%patch2 -p1 -b .pcre-backported-fixes
 
 %build
 %global BUILD_FLAGS $RPM_OPT_FLAGS
@@ -93,6 +93,10 @@ fi
 %{_libexecdir}/grepconf.sh
 
 %changelog
+* Fri Nov 14 2014 Jaroslav Škarvada <jskarvad@redhat.com> - 2.20-7
+- Backported more PCRE fixes (by pcre-backported-fixes patch)
+- Dropped pcre-invalid-utf8-fix patch, handled by pcre-backported-fixes patch
+
 * Tue Nov 11 2014 Jaroslav Škarvada <jskarvad@redhat.com> - 2.20-6
 - Fixed invalid UTF-8 byte sequence error in PCRE mode
   (by pcre-invalid-utf8-fix patch)