- Added dfa-optimize-period patch (speedup for . patterns in UTF-8)

- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8) - Added mmap-option-fix patch - Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic complexity when going to glibc) - Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] [:xdigit:]])
2010-05-07 11:34:53 +00:00 · 2010-05-07 11:34:53 +00:00 · 6ccb1073e9
commit 6ccb1073e9
parent 532439cc73
6 changed files with 535 additions and 2 deletions
--- a/grep-2.6.3-dfa-convert-to-wide-char.patch
+++ b/grep-2.6.3-dfa-convert-to-wide-char.patch
@ -0,0 +1,162 @@
 From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
 From: Paolo Bonzini <bonzini@gnu.org>
 Date: Tue, 4 May 2010 17:26:09 +0200
 Subject: [PATCH] dfa: convert to wide character line-by-line
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 This provides a nice speedup for -m in general, but especially
 it avoids quadratic complexity in case we have to go to glibc.
 Testcases:
   # From upstream backref-multibyte-slow
   yes aba | sed 10000q > aba.txt
   time ./egrep -c '^([a-z]).\1$' aba.txt
   # From rbiba
   time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
 * src/dfa.c (prepare_wc_buf): Extract out of dfaexec.  Convert
 only up to the next newline.
 (dfaexec): Exit multibyte processing loop if past buf_end.
 Call prepare_wc_buf again after processing a newline.
 ---
 src/dfa.c |   96 +++++++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 59 insertions(+), 37 deletions(-)
 diff --git a/src/dfa.c b/src/dfa.c
 index 523fe05..70aa5a8 100644
 --- a/src/dfa.c
 +++ b/src/dfa.c
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
 #endif /* MBS_SUPPORT */
 +/* Initialize mblen_buf and inputwcs with data from the next line.  */
 +
 +static void
 +prepare_wc_buf (const char *begin, const char *end)
 +{
 +  unsigned char eol = eolbyte;
 +  size_t remain_bytes, i;
 +
 +  buf_begin = (unsigned char *) begin;
 +
 +  remain_bytes = 0;
 +  for (i = 0; i < end - begin + 1; i++)
 +    {
 +      if (remain_bytes == 0)
 +        {
 +          remain_bytes
 +            = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
 +          if (remain_bytes < 1
 +              || remain_bytes == (size_t) -1
 +              || remain_bytes == (size_t) -2
 +              || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
 +            {
 +              remain_bytes = 0;
 +              inputwcs[i] = (wchar_t)begin[i];
 +              mblen_buf[i] = 0;
 +              if (begin[i] == eol)
 +                break;
 +            }
 +          else
 +            {
 +              mblen_buf[i] = remain_bytes;
 +              remain_bytes--;
 +            }
 +        }
 +      else
 +        {
 +          mblen_buf[i] = remain_bytes;
 +          inputwcs[i] = 0;
 +          remain_bytes--;
 +        }
 +    }
 +
 +  buf_end = (unsigned char *) (begin + i);
 +  mblen_buf[i] = 0;
 +  inputwcs[i] = 0; /* sentinel */
 +}
 +
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the
    buffer, and the shortest possible version thereof.  Return a pointer to
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 #ifdef MBS_SUPPORT
   if (d->mb_cur_max > 1)
     {
 -      int remain_bytes, i;
 -      buf_begin = (unsigned char *) begin;
 -      buf_end = (unsigned char *) end;
 -
 -      /* initialize mblen_buf, and inputwcs.  */
       MALLOC(mblen_buf, unsigned char, end - begin + 2);
       MALLOC(inputwcs, wchar_t, end - begin + 2);
       memset(&mbs, 0, sizeof(mbstate_t));
 -      remain_bytes = 0;
 -      for (i = 0; i < end - begin + 1; i++)
 -	{
 -	  if (remain_bytes == 0)
 -	    {
 -	      remain_bytes
 -		= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
 -	      if (remain_bytes < 1
 -		|| (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
 -		{
 -		  remain_bytes = 0;
 -		  inputwcs[i] = (wchar_t)begin[i];
 -		  mblen_buf[i] = 0;
 -		}
 -	      else
 -		{
 -		  mblen_buf[i] = remain_bytes;
 -		  remain_bytes--;
 -		}
 -	    }
 -	  else
 -	    {
 -	      mblen_buf[i] = remain_bytes;
 -	      inputwcs[i] = 0;
 -	      remain_bytes--;
 -	    }
 -	}
 -      mblen_buf[i] = 0;
 -      inputwcs[i] = 0; /* sentinel */
 +      prepare_wc_buf (p, end);
     }
 #endif /* MBS_SUPPORT */
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       if (d->mb_cur_max > 1)
 	while ((t = trans[s]))
 	  {
 -	    if ((char *) p > end)
 +	    if (p > buf_end)
 	      break;
 	    s1 = s;
 	    SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 	}
       /* If the previous character was a newline, count it. */
 -      if (count && (char *) p <= end && p[-1] == eol)
 -	++*count;
 +      if ((char *) p <= end && p[-1] == eol)
 +        {
 +          if (count)
 +            ++*count;
 +
 +#ifdef MBS_SUPPORT
 +          if (d->mb_cur_max > 1)
 +            prepare_wc_buf (p, end);
 +#endif
 +        }
       /* Check if we've run off the end of the buffer. */
       if ((char *) p > end)
 -- 
 1.6.6.1
--- a/grep-2.6.3-dfa-optimize-period.patch
+++ b/grep-2.6.3-dfa-optimize-period.patch
@ -0,0 +1,165 @@
 From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
 From: Paolo Bonzini <bonzini@gnu.org>
 Date: Mon, 19 Apr 2010 14:50:23 +0200
 Subject: [PATCH 1/2] dfa: optimize UTF-8 period
 Backport of upstream commits 7a0ad00 and 42ac56a.
 * src/dfa.h (struct dfa): Add utf8_anychar_classes.
 * src/dfa.c (add_utf8_anychar): New.
 (atom): Simplify if/else nesting.  Call add_utf8_anychar for ANYCHAR
 in UTF-8 locales.
 (dfaoptimize): Abort on ANYCHAR.
 ---
 src/dfa.c |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 src/dfa.h |    1 +
 2 files changed, 82 insertions(+), 14 deletions(-)
 diff --git a/src/dfa.c b/src/dfa.c
 index ba78b08..e13c361 100644
 --- a/src/dfa.c
 +++ b/src/dfa.c
@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
 }
 #endif
 +static void
 +add_utf8_anychar (void)
 +{
 +  static const charclass utf8_classes[5] = {
 +      {  0,  0,  0,  0, ~0, ~0, 0, 0 },            /* 80-bf: non-lead bytes */
 +      { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 },   /* 00-bf, f8-ff: 1-byte/invalid */
 +      {  0,  0,  0,  0,  0,  0, ~0, 0 },           /* c0-df: 2-byte sequence */
 +      {  0,  0,  0,  0,  0,  0,  0, 0xffff },      /* e0-ef: 3-byte sequence */
 +      {  0,  0,  0,  0,  0,  0,  0, 0xff0000 }     /* f0-f7: 4-byte sequence */
 +  };
 +  const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
 +  unsigned int i;
 +
 +  /* Define the five character classes that are needed below.  */
 +  if (dfa->utf8_anychar_classes[0] == 0)
 +    for (i = 0; i < n; i++)
 +      {
 +        charclass c;
 +        memcpy (c, utf8_classes[i], sizeof c);
 +        if (i == 1)
 +          {
 +            if (!(syntax_bits & RE_DOT_NEWLINE))
 +              clrbit (eolbyte, c);
 +            if (syntax_bits & RE_DOT_NOT_NULL)
 +              clrbit ('\0', c);
 +          }
 +        dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
 +      }
 +
 +  /* A valid UTF-8 character is
 +
 +          ([0x00-0x7f]
 +           |[0xc2-0xdf][0x80-0xbf]
 +           |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
 +           |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
 +
 +     which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x80-0xbf]
 +     and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
 +     Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
 +  for (i = 1; i < n; i++)
 +    addtok (dfa->utf8_anychar_classes[i]);
 +  while (--i > 1)
 +    {
 +      addtok (dfa->utf8_anychar_classes[0]);
 +      addtok (CAT);
 +      addtok (OR);
 +    }
 +}
 +
 /* The grammar understood by the parser is as follows.
    regexp:
@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
 static void
 atom (void)
 {
 +  if (0)
 +    {
 +      /* empty */
 +    }
 #ifdef MBS_SUPPORT
 -  if (tok == WCHAR)
 +  else if (tok == WCHAR)
     {
       addtok_wc (case_fold ? towlower(wctok) : wctok);
 #ifndef GREP
@@ -1242,16 +1295,28 @@ atom (void)
 #endif
       tok = lex();
 -      return;
 +    }
 +
 +  else if (tok == ANYCHAR && using_utf8())
 +    {
 +      /* For UTF-8 expand the period to a series of CSETs that define a valid
 +	 UTF-8 character.  This avoids using the slow multibyte path.  I'm
 +	 pretty sure it would be both profitable and correct to do it for
 +	 any encoding; however, the optimization must be done manually as
 +	 it is done above in add_utf8_anychar.	So, let's start with
 +	 UTF-8: it is the most used, and the structure of the encoding
 +	 makes the correctness more obvious.  */
 +      add_utf8_anychar();
 +      tok = lex();
     }
 #endif /* MBS_SUPPORT  */
 -  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
 -      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
 +  else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
 +       	   || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
 #ifdef MBS_SUPPORT
 -      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
 +     	   || tok == ANYCHAR || tok == MBCSET
 #endif /* MBS_SUPPORT */
 -      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
 +	   || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
       addtok(tok);
       tok = lex();
@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
   for (i = 0; i < d->tindex; ++i)
     {
       switch(d->tokens[i])
 -	{
 -	case ANYCHAR:
 -	case MBCSET:
 -	  /* Requires multi-byte algorithm.  */
 -	  return;
 -	default:
 -	  break;
 -	}
 +        {
 +        case ANYCHAR:
 +          /* Lowered.  */
 +          abort ();
 +        case MBCSET:
 +          /* Requires multi-byte algorithm.  */
 +          return;
 +        default:
 +          break;
 +        }
     }
   free_mbdata (d);
 diff --git a/src/dfa.h b/src/dfa.h
 index 1c85207..42c177a 100644
 --- a/src/dfa.h
 +++ b/src/dfa.h
@@ -283,6 +283,7 @@ struct dfa
 				   with dfaparse(). */
 #ifdef MBS_SUPPORT
   unsigned int mb_cur_max;	/* Cached value of MB_CUR_MAX.  */
 +  int utf8_anychar_classes[5];	/* To lower ANYCHAR in UTF-8 locales.  */
   /* The following are used only if MB_CUR_MAX > 1.  */
 -- 
 1.6.6.1
--- a/grep-2.6.3-dfa-speedup-digit-xdigit.patch
+++ b/grep-2.6.3-dfa-speedup-digit-xdigit.patch
@ -0,0 +1,114 @@
 From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001
 From: Paolo Bonzini <bonzini@gnu.org>
 Date: Tue, 4 May 2010 18:07:28 +0200
 Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]]
 There's no "multibyte pain" in these two classes, since POSIX
 and ISO C99 mandate their contents.
 Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words"
 Before: 1.5s, after: 0.07s.  (sed manages only 0.5s).
 * src/dfa.c (predicates): Declare struct dfa_ctype separately
 from definition.  Add sb_only.
 (find_pred): Return const struct dfa_ctype *.
 (parse_bracket_exp): Return const struct dfa_ctype *.  Do
 not fill MBCSET for sb_only character types.
 ---
 src/dfa.c |   55 ++++++++++++++++++++++++++++---------------------------
 1 files changed, 28 insertions(+), 27 deletions(-)
 diff --git a/src/dfa.c b/src/dfa.c
 index 4dd26c9..da5a306 100644
 --- a/src/dfa.c
 +++ b/src/dfa.c
@@ -429,26 +429,29 @@ typedef int predicate (int);
 /* The following list maps the names of the Posix named character classes
    to predicate functions that determine whether a given character is in
    the class.  The leading [ has already been eaten by the lexical analyzer. */
 -static struct {
 +struct dfa_ctype {
   const char *name;
 -  predicate *pred;
 -} const prednames[] = {
 -  { "alpha", is_alpha },
 -  { "upper", is_upper },
 -  { "lower", is_lower },
 -  { "digit", is_digit },
 -  { "xdigit", is_xdigit },
 -  { "space", is_space },
 -  { "punct", is_punct },
 -  { "alnum", is_alnum },
 -  { "print", is_print },
 -  { "graph", is_graph },
 -  { "cntrl", is_cntrl },
 -  { "blank", is_blank },
 -  { NULL, NULL }
 +  predicate *func;
 +  bool sb_only;
 };
 -static predicate *
 +static const struct dfa_ctype prednames[] = {
 +  { "alpha", isalpha, false },
 +  { "upper", isupper, false },
 +  { "lower", islower, false },
 +  { "digit", isdigit, true },
 +  { "xdigit", isxdigit, true },
 +  { "space", isspace, false },
 +  { "punct", ispunct, false },
 +  { "alnum", isalnum, false },
 +  { "print", isprint, false },
 +  { "graph", isgraph, false },
 +  { "cntrl", iscntrl, false },
 +  { "blank", isblank, false },
 +  { NULL, NULL, false }
 +};
 +
 +static const struct dfa_ctype *
 find_pred (const char *str)
 {
   unsigned int i;
@@ -456,7 +459,7 @@ find_pred (const char *str)
     if (!strcmp(str, prednames[i].name))
       break;
 -  return prednames[i].pred;
 +  return &prednames[i];
 }
 /* Multibyte character handling sub-routine for lex.
@@ -553,8 +556,11 @@ parse_bracket_exp (void)
 				     || !strcmp (str, "lower"))
 				       ? "alpha"
 				       : str);
 +                  const struct dfa_ctype *pred = find_pred (class);
 +                  if (!pred)
 +                    dfaerror(_("invalid character class"));
 #ifdef MBS_SUPPORT
 -                  if (MB_CUR_MAX > 1)
 +                  if (MB_CUR_MAX > 1 && !pred->sb_only)
                     {
 		      /* Store the character class as wctype_t.  */
                       wctype_t wt = wctype (class);
@@ -568,14 +574,9 @@ parse_bracket_exp (void)
                     }
 #endif
 -                  {
 -                    predicate *pred = find_pred (class);
 -                    if (!pred)
 -                      dfaerror(_("invalid character class"));
 -                    for (c2 = 0; c2 < NOTCHAR; ++c2)
 -                      if ((*pred)(c2))
 -                        setbit_case_fold (c2, ccl);
 -                  }
 +                  for (c2 = 0; c2 < NOTCHAR; ++c2)
 +                    if (pred->func(c2))
 +                      setbit_case_fold (c2, ccl);
                 }
 #ifdef MBS_SUPPORT
 -- 
 1.6.6.1
--- a/grep-2.6.3-glibc-matcher-fallback.patch
+++ b/grep-2.6.3-glibc-matcher-fallback.patch
@ -0,0 +1,38 @@
 From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001
 From: Paolo Bonzini <bonzini@gnu.org>
 Date: Thu, 29 Apr 2010 17:13:32 +0200
 Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found
 This patch works around the performance problems of multibyte grep
 upstream.
 For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z].
 For other character sets all brackets and `.` as well will trigger it.
 ---
 src/dfa.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)
 diff --git a/src/dfa.c b/src/dfa.c
 index e13c361..523fe05 100644
 --- a/src/dfa.c
 +++ b/src/dfa.c
@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 		continue;
 	      }
 +	    if (backref)
 +              {
 +                *backref = 1;
 +                free(mblen_buf);
 +                free(inputwcs);
 +                *end = saved_end;
 +                return (char *) p;
 +              }
 +
 	    /* Can match with a multibyte character (and multi character
 	       collating element).  Transition table might be updated.  */
 	    s = transit_state(d, s, &p);
 -- 
 1.6.6.1
--- a/grep-2.6.3-mmap-option-fix.patch
+++ b/grep-2.6.3-mmap-option-fix.patch
@ -0,0 +1,26 @@
 From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001
 From: Paolo Bonzini <bonzini@gnu.org>
 Date: Tue, 20 Apr 2010 12:32:22 +0200
 Subject: [PATCH] grep: fix --mmap not being ignored
 * NEWS: Document bugfix.
 * main.c (main): Ignore MMAP_OPTION.
 ---
 src/main.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)
 diff --git a/src/main.c b/src/main.c
 index 1697c80..0ccf6eb 100644
 --- a/src/main.c
 +++ b/src/main.c
@@ -2061,6 +2061,7 @@ main (int argc, char **argv)
 	label = optarg;
 	break;
 +      case MMAP_OPTION:
       case 0:
 	/* long options */
 	break;
 -- 
 1.6.6.1
--- a/grep.spec
+++ b/grep.spec
@ -3,7 +3,7 @@
 Summary: Pattern matching utilities
 Name: grep
 Version: 2.6.3
-Release: 1%{?dist}
+Release: 2%{?dist}
 License: GPLv3+
 Group: Applications/Text
 Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
@ -13,6 +13,21 @@ Requires(preun): /sbin/install-info
 BuildRoot:	%(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
 BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext
 BuildRequires: autoconf automake
 # Speedups DFA UTF-8 period patterns.
 # original name: 0001-dfa-optimize-UTF-8-period.patch
 Patch0: grep-2.6.3-dfa-optimize-period.patch
 # Speedups MBCSET ([a-z]) UTF-8 patterns.
 # original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch
 Patch1: grep-2.6.3-glibc-matcher-fallback.patch
 # Deprecated --mmap option is now properly ignored.
 # 0003-grep-fix-mmap-not-being-ignored.patch
 Patch2: grep-2.6.3-mmap-option-fix.patch
 # Speedups -m and removes quadratic complexity when going to glibc.
 # 0004-dfa-convert-to-wide-character-line-by-line.patch
 Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch
 # Speedups DFA [[:digit:]] and [[:xdigit:]] patterns.
 # 0005-dfa-speed-up-digit-and-xdigit.patch
 Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch
 %description
 The GNU versions of commonly used grep utilities. Grep searches through
@ -23,6 +38,11 @@ GNU grep is needed by many scripts, so it shall be installed on every system.
 %prep
 %setup -q
 %patch0 -p1 -b .dfa-optimize-period
 %patch1 -p1 -b .glibc-matcher-fallback
 %patch2 -p1 -b .mmap-option-fix
 %patch3 -p1 -b .dfa-convert-to-wide-char
 %patch4 -p1 -b .speedup-digit-xdigit
 %build
 %configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre"
@ -59,6 +79,14 @@ fi
 %{_mandir}/*/*
 %changelog
 * Tue May 06 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-2
 - Added dfa-optimize-period patch (speedup for . patterns in UTF-8)
 - Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8)
 - Added mmap-option-fix patch
 - Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic
  complexity when going to glibc)
 - Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] [:xdigit:]])
 * Sun Apr 04 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-1
 - New version: grep-2.6.3
 - make check is not silent now
@ -77,7 +105,7 @@ fi
 - Added w patch to fix -w switch behaviour broken by dfa-optional patch
 * Wed Feb 10 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.5.4-1
- New version: grep-2.5.4 (#502931)
+- New version: grep-2.5.4
 - Fixed typos in %description
 - Updated utf-8 patch
 - Added dfa-optional patch (#538423)