- New version: grep-2.7

- Removed patches (already in upstream): dfa-optimize-period, glibc-matcher-fallback, mmap-option-fix, dfa-convert-to-wide-char, dfa-speedup-digit-xdigit
2010-09-21 15:04:45 +02:00 · 2010-09-21 15:04:45 +02:00 · 5ec5b7c5f8
commit 5ec5b7c5f8
parent ddee1abb59
8 changed files with 10 additions and 528 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 grep-2.6.3.tar.xz
+/grep-2.7.tar.xz
--- a/grep-2.6.3-dfa-convert-to-wide-char.patch
+++ b/grep-2.6.3-dfa-convert-to-wide-char.patch
@@ -1,162 +0,0 @@
-From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <bonzini@gnu.org>
-Date: Tue, 4 May 2010 17:26:09 +0200
-Subject: [PATCH] dfa: convert to wide character line-by-line
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-This provides a nice speedup for -m in general, but especially
-it avoids quadratic complexity in case we have to go to glibc.
-
-Testcases:
-
-   # From upstream backref-multibyte-slow
-   yes aba | sed 10000q > aba.txt
-   time ./egrep -c '^([a-z]).\1$' aba.txt
-
-   # From rbiba
-   time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
-
-* src/dfa.c (prepare_wc_buf): Extract out of dfaexec.  Convert
-only up to the next newline.
-(dfaexec): Exit multibyte processing loop if past buf_end.
-Call prepare_wc_buf again after processing a newline.
---
- src/dfa.c |   96 +++++++++++++++++++++++++++++++++++++-----------------------
- 1 files changed, 59 insertions(+), 37 deletions(-)
-
-diff --git a/src/dfa.c b/src/dfa.c
-index 523fe05..70aa5a8 100644
--- a/src/dfa.c
-+++ b/src/dfa.c
-@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
- 
- #endif /* MBS_SUPPORT */
- 
-+/* Initialize mblen_buf and inputwcs with data from the next line.  */
-+
-+static void
-+prepare_wc_buf (const char *begin, const char *end)
-+{
-+  unsigned char eol = eolbyte;
-+  size_t remain_bytes, i;
-+
-+  buf_begin = (unsigned char *) begin;
-+
-+  remain_bytes = 0;
-+  for (i = 0; i < end - begin + 1; i++)
-+    {
-+      if (remain_bytes == 0)
-+        {
-+          remain_bytes
-+            = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-+          if (remain_bytes < 1
-+              || remain_bytes == (size_t) -1
-+              || remain_bytes == (size_t) -2
-+              || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
-+            {
-+              remain_bytes = 0;
-+              inputwcs[i] = (wchar_t)begin[i];
-+              mblen_buf[i] = 0;
-+              if (begin[i] == eol)
-+                break;
-+            }
-+          else
-+            {
-+              mblen_buf[i] = remain_bytes;
-+              remain_bytes--;
-+            }
-+        }
-+      else
-+        {
-+          mblen_buf[i] = remain_bytes;
-+          inputwcs[i] = 0;
-+          remain_bytes--;
-+        }
-+    }
-+
-+  buf_end = (unsigned char *) (begin + i);
-+  mblen_buf[i] = 0;
-+  inputwcs[i] = 0; /* sentinel */
-+}
-+
- /* Search through a buffer looking for a match to the given struct dfa.
-    Find the first occurrence of a string matching the regexp in the
-    buffer, and the shortest possible version thereof.  Return a pointer to
-@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
- #ifdef MBS_SUPPORT
-   if (d->mb_cur_max > 1)
-     {
-      int remain_bytes, i;
-      buf_begin = (unsigned char *) begin;
-      buf_end = (unsigned char *) end;
-
-      /* initialize mblen_buf, and inputwcs.  */
-       MALLOC(mblen_buf, unsigned char, end - begin + 2);
-       MALLOC(inputwcs, wchar_t, end - begin + 2);
-       memset(&mbs, 0, sizeof(mbstate_t));
-      remain_bytes = 0;
-      for (i = 0; i < end - begin + 1; i++)
-	{
-	  if (remain_bytes == 0)
-	    {
-	      remain_bytes
-		= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-	      if (remain_bytes < 1
-		|| (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
-		{
-		  remain_bytes = 0;
-		  inputwcs[i] = (wchar_t)begin[i];
-		  mblen_buf[i] = 0;
-		}
-	      else
-		{
-		  mblen_buf[i] = remain_bytes;
-		  remain_bytes--;
-		}
-	    }
-	  else
-	    {
-	      mblen_buf[i] = remain_bytes;
-	      inputwcs[i] = 0;
-	      remain_bytes--;
-	    }
-	}
-      mblen_buf[i] = 0;
-      inputwcs[i] = 0; /* sentinel */
-+      prepare_wc_buf (p, end);
-     }
- #endif /* MBS_SUPPORT */
- 
-@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
-       if (d->mb_cur_max > 1)
- 	while ((t = trans[s]))
- 	  {
-	    if ((char *) p > end)
-+	    if (p > buf_end)
- 	      break;
- 	    s1 = s;
- 	    SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
-@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
- 	}
- 
-       /* If the previous character was a newline, count it. */
-      if (count && (char *) p <= end && p[-1] == eol)
-	++*count;
-+      if ((char *) p <= end && p[-1] == eol)
-+        {
-+          if (count)
-+            ++*count;
-+
-+#ifdef MBS_SUPPORT
-+          if (d->mb_cur_max > 1)
-+            prepare_wc_buf (p, end);
-+#endif
-+        }
- 
-       /* Check if we've run off the end of the buffer. */
-       if ((char *) p > end)
-- 
-1.6.6.1
-
--- a/grep-2.6.3-dfa-optimize-period.patch
+++ b/grep-2.6.3-dfa-optimize-period.patch
@@ -1,165 +0,0 @@
-From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <bonzini@gnu.org>
-Date: Mon, 19 Apr 2010 14:50:23 +0200
-Subject: [PATCH 1/2] dfa: optimize UTF-8 period
-
-Backport of upstream commits 7a0ad00 and 42ac56a.
-
-* src/dfa.h (struct dfa): Add utf8_anychar_classes.
-* src/dfa.c (add_utf8_anychar): New.
-(atom): Simplify if/else nesting.  Call add_utf8_anychar for ANYCHAR
-in UTF-8 locales.
-(dfaoptimize): Abort on ANYCHAR.
---
- src/dfa.c |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
- src/dfa.h |    1 +
- 2 files changed, 82 insertions(+), 14 deletions(-)
-
-diff --git a/src/dfa.c b/src/dfa.c
-index ba78b08..e13c361 100644
--- a/src/dfa.c
-+++ b/src/dfa.c
-@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
- }
- #endif
- 
-+static void
-+add_utf8_anychar (void)
-+{
-+  static const charclass utf8_classes[5] = {
-+      {  0,  0,  0,  0, ~0, ~0, 0, 0 },            /* 80-bf: non-lead bytes */
-+      { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 },   /* 00-bf, f8-ff: 1-byte/invalid */
-+      {  0,  0,  0,  0,  0,  0, ~0, 0 },           /* c0-df: 2-byte sequence */
-+      {  0,  0,  0,  0,  0,  0,  0, 0xffff },      /* e0-ef: 3-byte sequence */
-+      {  0,  0,  0,  0,  0,  0,  0, 0xff0000 }     /* f0-f7: 4-byte sequence */
-+  };
-+  const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
-+  unsigned int i;
-+
-+  /* Define the five character classes that are needed below.  */
-+  if (dfa->utf8_anychar_classes[0] == 0)
-+    for (i = 0; i < n; i++)
-+      {
-+        charclass c;
-+        memcpy (c, utf8_classes[i], sizeof c);
-+        if (i == 1)
-+          {
-+            if (!(syntax_bits & RE_DOT_NEWLINE))
-+              clrbit (eolbyte, c);
-+            if (syntax_bits & RE_DOT_NOT_NULL)
-+              clrbit ('\0', c);
-+          }
-+        dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
-+      }
-+
-+  /* A valid UTF-8 character is
-+
-+          ([0x00-0x7f]
-+           |[0xc2-0xdf][0x80-0xbf]
-+           |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
-+           |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
-+
-+     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x80-0xbf]
-+     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
-+     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
-+  for (i = 1; i < n; i++)
-+    addtok (dfa->utf8_anychar_classes[i]);
-+  while (--i > 1)
-+    {
-+      addtok (dfa->utf8_anychar_classes[0]);
-+      addtok (CAT);
-+      addtok (OR);
-+    }
-+}
-+
- /* The grammar understood by the parser is as follows.
- 
-    regexp:
-@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
- static void
- atom (void)
- {
-+  if (0)
-+    {
-+      /* empty */
-+    }
- #ifdef MBS_SUPPORT
-  if (tok == WCHAR)
-+  else if (tok == WCHAR)
-     {
-       addtok_wc (case_fold ? towlower(wctok) : wctok);
- #ifndef GREP
-@@ -1242,16 +1295,28 @@ atom (void)
- #endif
- 
-       tok = lex();
-      return;
-+    }
-+
-+  else if (tok == ANYCHAR && using_utf8())
-+    {
-+      /* For UTF-8 expand the period to a series of CSETs that define a valid
-+	 UTF-8 character.  This avoids using the slow multibyte path.  I'm
-+	 pretty sure it would be both profitable and correct to do it for
-+	 any encoding; however, the optimization must be done manually as
-+	 it is done above in add_utf8_anychar.	So, let's start with
-+	 UTF-8: it is the most used, and the structure of the encoding
-+	 makes the correctness more obvious.  */
-+      add_utf8_anychar();
-+      tok = lex();
-     }
- #endif /* MBS_SUPPORT  */
- 
-  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
-      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
-+  else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
-+       	   || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
- #ifdef MBS_SUPPORT
-      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
-+     	   || tok == ANYCHAR || tok == MBCSET
- #endif /* MBS_SUPPORT */
-      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
-+	   || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
-     {
-       addtok(tok);
-       tok = lex();
-@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
-   for (i = 0; i < d->tindex; ++i)
-     {
-       switch(d->tokens[i])
-	{
-	case ANYCHAR:
-	case MBCSET:
-	  /* Requires multi-byte algorithm.  */
-	  return;
-	default:
-	  break;
-	}
-+        {
-+        case ANYCHAR:
-+          /* Lowered.  */
-+          abort ();
-+        case MBCSET:
-+          /* Requires multi-byte algorithm.  */
-+          return;
-+        default:
-+          break;
-+        }
-     }
- 
-   free_mbdata (d);
-diff --git a/src/dfa.h b/src/dfa.h
-index 1c85207..42c177a 100644
--- a/src/dfa.h
-+++ b/src/dfa.h
-@@ -283,6 +283,7 @@ struct dfa
- 				   with dfaparse(). */
- #ifdef MBS_SUPPORT
-   unsigned int mb_cur_max;	/* Cached value of MB_CUR_MAX.  */
-+  int utf8_anychar_classes[5];	/* To lower ANYCHAR in UTF-8 locales.  */
- 
-   /* The following are used only if MB_CUR_MAX > 1.  */
- 
-- 
-1.6.6.1
-
--- a/grep-2.6.3-dfa-speedup-digit-xdigit.patch
+++ b/grep-2.6.3-dfa-speedup-digit-xdigit.patch
@@ -1,114 +0,0 @@
-From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <bonzini@gnu.org>
-Date: Tue, 4 May 2010 18:07:28 +0200
-Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]]
-
-There's no "multibyte pain" in these two classes, since POSIX
-and ISO C99 mandate their contents.
-
-Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words"
-Before: 1.5s, after: 0.07s.  (sed manages only 0.5s).
-
-* src/dfa.c (predicates): Declare struct dfa_ctype separately
-from definition.  Add sb_only.
-(find_pred): Return const struct dfa_ctype *.
-(parse_bracket_exp): Return const struct dfa_ctype *.  Do
-not fill MBCSET for sb_only character types.
---
- src/dfa.c |   55 ++++++++++++++++++++++++++++---------------------------
- 1 files changed, 28 insertions(+), 27 deletions(-)
-
-diff --git a/src/dfa.c b/src/dfa.c
-index 4dd26c9..da5a306 100644
--- a/src/dfa.c
-+++ b/src/dfa.c
-@@ -429,26 +429,29 @@ typedef int predicate (int);
- /* The following list maps the names of the Posix named character classes
-    to predicate functions that determine whether a given character is in
-    the class.  The leading [ has already been eaten by the lexical analyzer. */
-static struct {
-+struct dfa_ctype {
-   const char *name;
-  predicate *pred;
-} const prednames[] = {
-  { "alpha", is_alpha },
-  { "upper", is_upper },
-  { "lower", is_lower },
-  { "digit", is_digit },
-  { "xdigit", is_xdigit },
-  { "space", is_space },
-  { "punct", is_punct },
-  { "alnum", is_alnum },
-  { "print", is_print },
-  { "graph", is_graph },
-  { "cntrl", is_cntrl },
-  { "blank", is_blank },
-  { NULL, NULL }
-+  predicate *func;
-+  bool sb_only;
- };
- 
-static predicate *
-+static const struct dfa_ctype prednames[] = {
-+  { "alpha", isalpha, false },
-+  { "upper", isupper, false },
-+  { "lower", islower, false },
-+  { "digit", isdigit, true },
-+  { "xdigit", isxdigit, true },
-+  { "space", isspace, false },
-+  { "punct", ispunct, false },
-+  { "alnum", isalnum, false },
-+  { "print", isprint, false },
-+  { "graph", isgraph, false },
-+  { "cntrl", iscntrl, false },
-+  { "blank", isblank, false },
-+  { NULL, NULL, false }
-+};
-+
-+static const struct dfa_ctype *
- find_pred (const char *str)
- {
-   unsigned int i;
-@@ -456,7 +459,7 @@ find_pred (const char *str)
-     if (!strcmp(str, prednames[i].name))
-       break;
- 
-  return prednames[i].pred;
-+  return &prednames[i];
- }
- 
- /* Multibyte character handling sub-routine for lex.
-@@ -553,8 +556,11 @@ parse_bracket_exp (void)
- 				     || !strcmp (str, "lower"))
- 				       ? "alpha"
- 				       : str);
-+                  const struct dfa_ctype *pred = find_pred (class);
-+                  if (!pred)
-+                    dfaerror(_("invalid character class"));
- #ifdef MBS_SUPPORT
-                  if (MB_CUR_MAX > 1)
-+                  if (MB_CUR_MAX > 1 && !pred->sb_only)
-                     {
- 		      /* Store the character class as wctype_t.  */
-                       wctype_t wt = wctype (class);
-@@ -568,14 +574,9 @@ parse_bracket_exp (void)
-                     }
- #endif
- 
-                  {
-                    predicate *pred = find_pred (class);
-                    if (!pred)
-                      dfaerror(_("invalid character class"));
-                    for (c2 = 0; c2 < NOTCHAR; ++c2)
-                      if ((*pred)(c2))
-                        setbit_case_fold (c2, ccl);
-                  }
-+                  for (c2 = 0; c2 < NOTCHAR; ++c2)
-+                    if (pred->func(c2))
-+                      setbit_case_fold (c2, ccl);
-                 }
- 
- #ifdef MBS_SUPPORT
-- 
-1.6.6.1
-
--- a/grep-2.6.3-glibc-matcher-fallback.patch
+++ b/grep-2.6.3-glibc-matcher-fallback.patch
@@ -1,38 +0,0 @@
-From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <bonzini@gnu.org>
-Date: Thu, 29 Apr 2010 17:13:32 +0200
-Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found
-
-This patch works around the performance problems of multibyte grep
-upstream.
-
-For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z].
-
-For other character sets all brackets and `.` as well will trigger it.
---
- src/dfa.c |    9 +++++++++
- 1 files changed, 9 insertions(+), 0 deletions(-)
-
-diff --git a/src/dfa.c b/src/dfa.c
-index e13c361..523fe05 100644
--- a/src/dfa.c
-+++ b/src/dfa.c
-@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
- 		continue;
- 	      }
- 
-+	    if (backref)
-+              {
-+                *backref = 1;
-+                free(mblen_buf);
-+                free(inputwcs);
-+                *end = saved_end;
-+                return (char *) p;
-+              }
-+
- 	    /* Can match with a multibyte character (and multi character
- 	       collating element).  Transition table might be updated.  */
- 	    s = transit_state(d, s, &p);
-- 
-1.6.6.1
-
--- a/grep-2.6.3-mmap-option-fix.patch
+++ b/grep-2.6.3-mmap-option-fix.patch
@@ -1,26 +0,0 @@
-From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <bonzini@gnu.org>
-Date: Tue, 20 Apr 2010 12:32:22 +0200
-Subject: [PATCH] grep: fix --mmap not being ignored
-
-* NEWS: Document bugfix.
-* main.c (main): Ignore MMAP_OPTION.
---
- src/main.c |    1 +
- 1 files changed, 1 insertions(+), 0 deletions(-)
-
-diff --git a/src/main.c b/src/main.c
-index 1697c80..0ccf6eb 100644
--- a/src/main.c
-+++ b/src/main.c
-@@ -2061,6 +2061,7 @@ main (int argc, char **argv)
- 	label = optarg;
- 	break;
- 
-+      case MMAP_OPTION:
-       case 0:
- 	/* long options */
- 	break;
-- 
-1.6.6.1
-
--- a/grep.spec
+++ b/grep.spec
@@ -2,8 +2,8 @@

 Summary: Pattern matching utilities
 Name: grep
-Version: 2.6.3
-Release: 4%{?dist}
+Version: 2.7
+Release: 1%{?dist}
 License: GPLv3+
 Group: Applications/Text
 Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
@@ -16,21 +16,6 @@ Requires(preun): /sbin/install-info
 BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
 BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext
 BuildRequires: autoconf automake
-# Speedups DFA UTF-8 period patterns.
-# original name: 0001-dfa-optimize-UTF-8-period.patch
-Patch0: grep-2.6.3-dfa-optimize-period.patch
-# Speedups MBCSET ([a-z]) UTF-8 patterns.
-# original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch
-Patch1: grep-2.6.3-glibc-matcher-fallback.patch
-# Deprecated --mmap option is now properly ignored.
-# 0003-grep-fix-mmap-not-being-ignored.patch
-Patch2: grep-2.6.3-mmap-option-fix.patch
-# Speedups -m and removes quadratic complexity when going to glibc.
-# 0004-dfa-convert-to-wide-character-line-by-line.patch
-Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch
-# Speedups DFA [[:digit:]] and [[:xdigit:]] patterns.
-# 0005-dfa-speed-up-digit-and-xdigit.patch
-Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch

 %description
 The GNU versions of commonly used grep utilities. Grep searches through
@@ -41,11 +26,6 @@ GNU grep is needed by many scripts, so it shall be installed on every system.

 %prep
 %setup -q
-%patch0 -p1 -b .dfa-optimize-period
-%patch1 -p1 -b .glibc-matcher-fallback
-%patch2 -p1 -b .mmap-option-fix
-%patch3 -p1 -b .dfa-convert-to-wide-char
-%patch4 -p1 -b .speedup-digit-xdigit

 %build
 %configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre"
@@ -87,6 +67,12 @@ fi
 %{_mandir}/*/*

 %changelog
+* Tue Sep 21 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.7-1
+- New version: grep-2.7
+- Removed patches (already in upstream): dfa-optimize-period,
+  glibc-matcher-fallback, mmap-option-fix, dfa-convert-to-wide-char,
+  dfa-speedup-digit-xdigit
+
 * Fri Jun 11 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-4
 - Colors can be globally disabled via /etc/GREP_COLORS (#602867)
 - Fixed indentation in spec
--- a/2
+++ b/2
@@ -1 +1 @@
-69a3bf508a3f14d12369e0e1c7a92763  grep-2.6.3.tar.xz
+6dd9931a52501519d7779a27cf953326  grep-2.7.tar.xz