- Added dfa-optimize-period patch (speedup for . patterns in UTF-8)

- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8)
- Added mmap-option-fix patch
- Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic
    complexity when going to glibc)
- Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]]
    [:xdigit:]])
This commit is contained in:
Jaroslav Škarvada 2010-05-07 11:34:53 +00:00
parent 532439cc73
commit 6ccb1073e9
6 changed files with 535 additions and 2 deletions

View File

@ -0,0 +1,162 @@
From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Tue, 4 May 2010 17:26:09 +0200
Subject: [PATCH] dfa: convert to wide character line-by-line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This provides a nice speedup for -m in general, but especially
it avoids quadratic complexity in case we have to go to glibc.
Testcases:
# From upstream backref-multibyte-slow
yes aba | sed 10000q > aba.txt
time ./egrep -c '^([a-z]).\1$' aba.txt
# From rbiba
time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert
only up to the next newline.
(dfaexec): Exit multibyte processing loop if past buf_end.
Call prepare_wc_buf again after processing a newline.
---
src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 59 insertions(+), 37 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 523fe05..70aa5a8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
#endif /* MBS_SUPPORT */
+/* Initialize mblen_buf and inputwcs with data from the next line. */
+
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+ unsigned char eol = eolbyte;
+ size_t remain_bytes, i;
+
+ buf_begin = (unsigned char *) begin;
+
+ remain_bytes = 0;
+ for (i = 0; i < end - begin + 1; i++)
+ {
+ if (remain_bytes == 0)
+ {
+ remain_bytes
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+ if (remain_bytes < 1
+ || remain_bytes == (size_t) -1
+ || remain_bytes == (size_t) -2
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
+ {
+ remain_bytes = 0;
+ inputwcs[i] = (wchar_t)begin[i];
+ mblen_buf[i] = 0;
+ if (begin[i] == eol)
+ break;
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes;
+ remain_bytes--;
+ }
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes;
+ inputwcs[i] = 0;
+ remain_bytes--;
+ }
+ }
+
+ buf_end = (unsigned char *) (begin + i);
+ mblen_buf[i] = 0;
+ inputwcs[i] = 0; /* sentinel */
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
#ifdef MBS_SUPPORT
if (d->mb_cur_max > 1)
{
- int remain_bytes, i;
- buf_begin = (unsigned char *) begin;
- buf_end = (unsigned char *) end;
-
- /* initialize mblen_buf, and inputwcs. */
MALLOC(mblen_buf, unsigned char, end - begin + 2);
MALLOC(inputwcs, wchar_t, end - begin + 2);
memset(&mbs, 0, sizeof(mbstate_t));
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
- {
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t)begin[i];
- mblen_buf[i] = 0;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- inputwcs[i] = 0;
- remain_bytes--;
- }
- }
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
+ prepare_wc_buf (p, end);
}
#endif /* MBS_SUPPORT */
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
while ((t = trans[s]))
{
- if ((char *) p > end)
+ if (p > buf_end)
break;
s1 = s;
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
/* If the previous character was a newline, count it. */
- if (count && (char *) p <= end && p[-1] == eol)
- ++*count;
+ if ((char *) p <= end && p[-1] == eol)
+ {
+ if (count)
+ ++*count;
+
+#ifdef MBS_SUPPORT
+ if (d->mb_cur_max > 1)
+ prepare_wc_buf (p, end);
+#endif
+ }
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
--
1.6.6.1

View File

@ -0,0 +1,165 @@
From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Mon, 19 Apr 2010 14:50:23 +0200
Subject: [PATCH 1/2] dfa: optimize UTF-8 period
Backport of upstream commits 7a0ad00 and 42ac56a.
* src/dfa.h (struct dfa): Add utf8_anychar_classes.
* src/dfa.c (add_utf8_anychar): New.
(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR
in UTF-8 locales.
(dfaoptimize): Abort on ANYCHAR.
---
src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
src/dfa.h | 1 +
2 files changed, 82 insertions(+), 14 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index ba78b08..e13c361 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
}
#endif
+static void
+add_utf8_anychar (void)
+{
+ static const charclass utf8_classes[5] = {
+ { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
+ { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */
+ { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */
+ { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */
+ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */
+ };
+ const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
+ unsigned int i;
+
+ /* Define the five character classes that are needed below. */
+ if (dfa->utf8_anychar_classes[0] == 0)
+ for (i = 0; i < n; i++)
+ {
+ charclass c;
+ memcpy (c, utf8_classes[i], sizeof c);
+ if (i == 1)
+ {
+ if (!(syntax_bits & RE_DOT_NEWLINE))
+ clrbit (eolbyte, c);
+ if (syntax_bits & RE_DOT_NOT_NULL)
+ clrbit ('\0', c);
+ }
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
+ }
+
+ /* A valid UTF-8 character is
+
+ ([0x00-0x7f]
+ |[0xc2-0xdf][0x80-0xbf]
+ |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
+ |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
+
+ which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf]
+ and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
+ Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
+ for (i = 1; i < n; i++)
+ addtok (dfa->utf8_anychar_classes[i]);
+ while (--i > 1)
+ {
+ addtok (dfa->utf8_anychar_classes[0]);
+ addtok (CAT);
+ addtok (OR);
+ }
+}
+
/* The grammar understood by the parser is as follows.
regexp:
@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
static void
atom (void)
{
+ if (0)
+ {
+ /* empty */
+ }
#ifdef MBS_SUPPORT
- if (tok == WCHAR)
+ else if (tok == WCHAR)
{
addtok_wc (case_fold ? towlower(wctok) : wctok);
#ifndef GREP
@@ -1242,16 +1295,28 @@ atom (void)
#endif
tok = lex();
- return;
+ }
+
+ else if (tok == ANYCHAR && using_utf8())
+ {
+ /* For UTF-8 expand the period to a series of CSETs that define a valid
+ UTF-8 character. This avoids using the slow multibyte path. I'm
+ pretty sure it would be both profitable and correct to do it for
+ any encoding; however, the optimization must be done manually as
+ it is done above in add_utf8_anychar. So, let's start with
+ UTF-8: it is the most used, and the structure of the encoding
+ makes the correctness more obvious. */
+ add_utf8_anychar();
+ tok = lex();
}
#endif /* MBS_SUPPORT */
- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
+ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
+ || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
#ifdef MBS_SUPPORT
- || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
+ || tok == ANYCHAR || tok == MBCSET
#endif /* MBS_SUPPORT */
- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
+ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
{
addtok(tok);
tok = lex();
@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
for (i = 0; i < d->tindex; ++i)
{
switch(d->tokens[i])
- {
- case ANYCHAR:
- case MBCSET:
- /* Requires multi-byte algorithm. */
- return;
- default:
- break;
- }
+ {
+ case ANYCHAR:
+ /* Lowered. */
+ abort ();
+ case MBCSET:
+ /* Requires multi-byte algorithm. */
+ return;
+ default:
+ break;
+ }
}
free_mbdata (d);
diff --git a/src/dfa.h b/src/dfa.h
index 1c85207..42c177a 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -283,6 +283,7 @@ struct dfa
with dfaparse(). */
#ifdef MBS_SUPPORT
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
+ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
/* The following are used only if MB_CUR_MAX > 1. */
--
1.6.6.1

View File

@ -0,0 +1,114 @@
From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Tue, 4 May 2010 18:07:28 +0200
Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]]
There's no "multibyte pain" in these two classes, since POSIX
and ISO C99 mandate their contents.
Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words"
Before: 1.5s, after: 0.07s. (sed manages only 0.5s).
* src/dfa.c (predicates): Declare struct dfa_ctype separately
from definition. Add sb_only.
(find_pred): Return const struct dfa_ctype *.
(parse_bracket_exp): Return const struct dfa_ctype *. Do
not fill MBCSET for sb_only character types.
---
src/dfa.c | 55 ++++++++++++++++++++++++++++---------------------------
1 files changed, 28 insertions(+), 27 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 4dd26c9..da5a306 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -429,26 +429,29 @@ typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
to predicate functions that determine whether a given character is in
the class. The leading [ has already been eaten by the lexical analyzer. */
-static struct {
+struct dfa_ctype {
const char *name;
- predicate *pred;
-} const prednames[] = {
- { "alpha", is_alpha },
- { "upper", is_upper },
- { "lower", is_lower },
- { "digit", is_digit },
- { "xdigit", is_xdigit },
- { "space", is_space },
- { "punct", is_punct },
- { "alnum", is_alnum },
- { "print", is_print },
- { "graph", is_graph },
- { "cntrl", is_cntrl },
- { "blank", is_blank },
- { NULL, NULL }
+ predicate *func;
+ bool sb_only;
};
-static predicate *
+static const struct dfa_ctype prednames[] = {
+ { "alpha", isalpha, false },
+ { "upper", isupper, false },
+ { "lower", islower, false },
+ { "digit", isdigit, true },
+ { "xdigit", isxdigit, true },
+ { "space", isspace, false },
+ { "punct", ispunct, false },
+ { "alnum", isalnum, false },
+ { "print", isprint, false },
+ { "graph", isgraph, false },
+ { "cntrl", iscntrl, false },
+ { "blank", isblank, false },
+ { NULL, NULL, false }
+};
+
+static const struct dfa_ctype *
find_pred (const char *str)
{
unsigned int i;
@@ -456,7 +459,7 @@ find_pred (const char *str)
if (!strcmp(str, prednames[i].name))
break;
- return prednames[i].pred;
+ return &prednames[i];
}
/* Multibyte character handling sub-routine for lex.
@@ -553,8 +556,11 @@ parse_bracket_exp (void)
|| !strcmp (str, "lower"))
? "alpha"
: str);
+ const struct dfa_ctype *pred = find_pred (class);
+ if (!pred)
+ dfaerror(_("invalid character class"));
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
+ if (MB_CUR_MAX > 1 && !pred->sb_only)
{
/* Store the character class as wctype_t. */
wctype_t wt = wctype (class);
@@ -568,14 +574,9 @@ parse_bracket_exp (void)
}
#endif
- {
- predicate *pred = find_pred (class);
- if (!pred)
- dfaerror(_("invalid character class"));
- for (c2 = 0; c2 < NOTCHAR; ++c2)
- if ((*pred)(c2))
- setbit_case_fold (c2, ccl);
- }
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if (pred->func(c2))
+ setbit_case_fold (c2, ccl);
}
#ifdef MBS_SUPPORT
--
1.6.6.1

View File

@ -0,0 +1,38 @@
From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Thu, 29 Apr 2010 17:13:32 +0200
Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found
This patch works around the performance problems of multibyte grep
upstream.
For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z].
For other character sets all brackets and `.` as well will trigger it.
---
src/dfa.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index e13c361..523fe05 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
continue;
}
+ if (backref)
+ {
+ *backref = 1;
+ free(mblen_buf);
+ free(inputwcs);
+ *end = saved_end;
+ return (char *) p;
+ }
+
/* Can match with a multibyte character (and multi character
collating element). Transition table might be updated. */
s = transit_state(d, s, &p);
--
1.6.6.1

View File

@ -0,0 +1,26 @@
From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Tue, 20 Apr 2010 12:32:22 +0200
Subject: [PATCH] grep: fix --mmap not being ignored
* NEWS: Document bugfix.
* main.c (main): Ignore MMAP_OPTION.
---
src/main.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/src/main.c b/src/main.c
index 1697c80..0ccf6eb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -2061,6 +2061,7 @@ main (int argc, char **argv)
label = optarg;
break;
+ case MMAP_OPTION:
case 0:
/* long options */
break;
--
1.6.6.1

View File

@ -3,7 +3,7 @@
Summary: Pattern matching utilities
Name: grep
Version: 2.6.3
Release: 1%{?dist}
Release: 2%{?dist}
License: GPLv3+
Group: Applications/Text
Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
@ -13,6 +13,21 @@ Requires(preun): /sbin/install-info
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext
BuildRequires: autoconf automake
# Speedups DFA UTF-8 period patterns.
# original name: 0001-dfa-optimize-UTF-8-period.patch
Patch0: grep-2.6.3-dfa-optimize-period.patch
# Speedups MBCSET ([a-z]) UTF-8 patterns.
# original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch
Patch1: grep-2.6.3-glibc-matcher-fallback.patch
# Deprecated --mmap option is now properly ignored.
# 0003-grep-fix-mmap-not-being-ignored.patch
Patch2: grep-2.6.3-mmap-option-fix.patch
# Speedups -m and removes quadratic complexity when going to glibc.
# 0004-dfa-convert-to-wide-character-line-by-line.patch
Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch
# Speedups DFA [[:digit:]] and [[:xdigit:]] patterns.
# 0005-dfa-speed-up-digit-and-xdigit.patch
Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch
%description
The GNU versions of commonly used grep utilities. Grep searches through
@ -23,6 +38,11 @@ GNU grep is needed by many scripts, so it shall be installed on every system.
%prep
%setup -q
%patch0 -p1 -b .dfa-optimize-period
%patch1 -p1 -b .glibc-matcher-fallback
%patch2 -p1 -b .mmap-option-fix
%patch3 -p1 -b .dfa-convert-to-wide-char
%patch4 -p1 -b .speedup-digit-xdigit
%build
%configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre"
@ -59,6 +79,14 @@ fi
%{_mandir}/*/*
%changelog
* Tue May 06 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-2
- Added dfa-optimize-period patch (speedup for . patterns in UTF-8)
- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8)
- Added mmap-option-fix patch
- Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic
complexity when going to glibc)
- Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] [:xdigit:]])
* Sun Apr 04 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-1
- New version: grep-2.6.3
- make check is not silent now
@ -77,7 +105,7 @@ fi
- Added w patch to fix -w switch behaviour broken by dfa-optional patch
* Wed Feb 10 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.5.4-1
- New version: grep-2.5.4 (#502931)
- New version: grep-2.5.4
- Fixed typos in %description
- Updated utf-8 patch
- Added dfa-optional patch (#538423)