- Added dfa-optimize-period patch (speedup for . patterns in UTF-8)
- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8) - Added mmap-option-fix patch - Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic complexity when going to glibc) - Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] and [[:xdigit:]])
This commit is contained in:
parent
532439cc73
commit
6ccb1073e9
162
grep-2.6.3-dfa-convert-to-wide-char.patch
Normal file
162
grep-2.6.3-dfa-convert-to-wide-char.patch
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||||||
|
Date: Tue, 4 May 2010 17:26:09 +0200
|
||||||
|
Subject: [PATCH] dfa: convert to wide character line-by-line
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
This provides a nice speedup for -m in general, but especially
|
||||||
|
it avoids quadratic complexity in case we have to go to glibc.
|
||||||
|
|
||||||
|
Testcases:
|
||||||
|
|
||||||
|
# From upstream backref-multibyte-slow
|
||||||
|
yes aba | sed 10000q > aba.txt
|
||||||
|
time ./egrep -c '^([a-z]).\1$' aba.txt
|
||||||
|
|
||||||
|
# From rbiba
|
||||||
|
time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
|
||||||
|
|
||||||
|
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert
|
||||||
|
only up to the next newline.
|
||||||
|
(dfaexec): Exit multibyte processing loop if past buf_end.
|
||||||
|
Call prepare_wc_buf again after processing a newline.
|
||||||
|
---
|
||||||
|
src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++-----------------------
|
||||||
|
1 files changed, 59 insertions(+), 37 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/dfa.c b/src/dfa.c
|
||||||
|
index 523fe05..70aa5a8 100644
|
||||||
|
--- a/src/dfa.c
|
||||||
|
+++ b/src/dfa.c
|
||||||
|
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
|
||||||
|
|
||||||
|
#endif /* MBS_SUPPORT */
|
||||||
|
|
||||||
|
+/* Initialize mblen_buf and inputwcs with data from the next line. */
|
||||||
|
+
|
||||||
|
+static void
|
||||||
|
+prepare_wc_buf (const char *begin, const char *end)
|
||||||
|
+{
|
||||||
|
+ unsigned char eol = eolbyte;
|
||||||
|
+ size_t remain_bytes, i;
|
||||||
|
+
|
||||||
|
+ buf_begin = (unsigned char *) begin;
|
||||||
|
+
|
||||||
|
+ remain_bytes = 0;
|
||||||
|
+ for (i = 0; i < end - begin + 1; i++)
|
||||||
|
+ {
|
||||||
|
+ if (remain_bytes == 0)
|
||||||
|
+ {
|
||||||
|
+ remain_bytes
|
||||||
|
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
|
||||||
|
+ if (remain_bytes < 1
|
||||||
|
+ || remain_bytes == (size_t) -1
|
||||||
|
+ || remain_bytes == (size_t) -2
|
||||||
|
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
|
||||||
|
+ {
|
||||||
|
+ remain_bytes = 0;
|
||||||
|
+ inputwcs[i] = (wchar_t)begin[i];
|
||||||
|
+ mblen_buf[i] = 0;
|
||||||
|
+ if (begin[i] == eol)
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+ else
|
||||||
|
+ {
|
||||||
|
+ mblen_buf[i] = remain_bytes;
|
||||||
|
+ remain_bytes--;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ else
|
||||||
|
+ {
|
||||||
|
+ mblen_buf[i] = remain_bytes;
|
||||||
|
+ inputwcs[i] = 0;
|
||||||
|
+ remain_bytes--;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ buf_end = (unsigned char *) (begin + i);
|
||||||
|
+ mblen_buf[i] = 0;
|
||||||
|
+ inputwcs[i] = 0; /* sentinel */
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/* Search through a buffer looking for a match to the given struct dfa.
|
||||||
|
Find the first occurrence of a string matching the regexp in the
|
||||||
|
buffer, and the shortest possible version thereof. Return a pointer to
|
||||||
|
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
if (d->mb_cur_max > 1)
|
||||||
|
{
|
||||||
|
- int remain_bytes, i;
|
||||||
|
- buf_begin = (unsigned char *) begin;
|
||||||
|
- buf_end = (unsigned char *) end;
|
||||||
|
-
|
||||||
|
- /* initialize mblen_buf, and inputwcs. */
|
||||||
|
MALLOC(mblen_buf, unsigned char, end - begin + 2);
|
||||||
|
MALLOC(inputwcs, wchar_t, end - begin + 2);
|
||||||
|
memset(&mbs, 0, sizeof(mbstate_t));
|
||||||
|
- remain_bytes = 0;
|
||||||
|
- for (i = 0; i < end - begin + 1; i++)
|
||||||
|
- {
|
||||||
|
- if (remain_bytes == 0)
|
||||||
|
- {
|
||||||
|
- remain_bytes
|
||||||
|
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
|
||||||
|
- if (remain_bytes < 1
|
||||||
|
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
|
||||||
|
- {
|
||||||
|
- remain_bytes = 0;
|
||||||
|
- inputwcs[i] = (wchar_t)begin[i];
|
||||||
|
- mblen_buf[i] = 0;
|
||||||
|
- }
|
||||||
|
- else
|
||||||
|
- {
|
||||||
|
- mblen_buf[i] = remain_bytes;
|
||||||
|
- remain_bytes--;
|
||||||
|
- }
|
||||||
|
- }
|
||||||
|
- else
|
||||||
|
- {
|
||||||
|
- mblen_buf[i] = remain_bytes;
|
||||||
|
- inputwcs[i] = 0;
|
||||||
|
- remain_bytes--;
|
||||||
|
- }
|
||||||
|
- }
|
||||||
|
- mblen_buf[i] = 0;
|
||||||
|
- inputwcs[i] = 0; /* sentinel */
|
||||||
|
+ prepare_wc_buf (p, end);
|
||||||
|
}
|
||||||
|
#endif /* MBS_SUPPORT */
|
||||||
|
|
||||||
|
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||||||
|
if (d->mb_cur_max > 1)
|
||||||
|
while ((t = trans[s]))
|
||||||
|
{
|
||||||
|
- if ((char *) p > end)
|
||||||
|
+ if (p > buf_end)
|
||||||
|
break;
|
||||||
|
s1 = s;
|
||||||
|
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
|
||||||
|
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the previous character was a newline, count it. */
|
||||||
|
- if (count && (char *) p <= end && p[-1] == eol)
|
||||||
|
- ++*count;
|
||||||
|
+ if ((char *) p <= end && p[-1] == eol)
|
||||||
|
+ {
|
||||||
|
+ if (count)
|
||||||
|
+ ++*count;
|
||||||
|
+
|
||||||
|
+#ifdef MBS_SUPPORT
|
||||||
|
+ if (d->mb_cur_max > 1)
|
||||||
|
+ prepare_wc_buf (p, end);
|
||||||
|
+#endif
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/* Check if we've run off the end of the buffer. */
|
||||||
|
if ((char *) p > end)
|
||||||
|
--
|
||||||
|
1.6.6.1
|
||||||
|
|
165
grep-2.6.3-dfa-optimize-period.patch
Normal file
165
grep-2.6.3-dfa-optimize-period.patch
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||||||
|
Date: Mon, 19 Apr 2010 14:50:23 +0200
|
||||||
|
Subject: [PATCH 1/2] dfa: optimize UTF-8 period
|
||||||
|
|
||||||
|
Backport of upstream commits 7a0ad00 and 42ac56a.
|
||||||
|
|
||||||
|
* src/dfa.h (struct dfa): Add utf8_anychar_classes.
|
||||||
|
* src/dfa.c (add_utf8_anychar): New.
|
||||||
|
(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR
|
||||||
|
in UTF-8 locales.
|
||||||
|
(dfaoptimize): Abort on ANYCHAR.
|
||||||
|
---
|
||||||
|
src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
|
||||||
|
src/dfa.h | 1 +
|
||||||
|
2 files changed, 82 insertions(+), 14 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/dfa.c b/src/dfa.c
|
||||||
|
index ba78b08..e13c361 100644
|
||||||
|
--- a/src/dfa.c
|
||||||
|
+++ b/src/dfa.c
|
||||||
|
@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
+static void
|
||||||
|
+add_utf8_anychar (void)
|
||||||
|
+{
|
||||||
|
+ static const charclass utf8_classes[5] = {
|
||||||
|
+ { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
|
||||||
|
+ { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */
|
||||||
|
+ { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */
|
||||||
|
+ { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */
|
||||||
|
+ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */
|
||||||
|
+ };
|
||||||
|
+ const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
|
||||||
|
+ unsigned int i;
|
||||||
|
+
|
||||||
|
+ /* Define the five character classes that are needed below. */
|
||||||
|
+ if (dfa->utf8_anychar_classes[0] == 0)
|
||||||
|
+ for (i = 0; i < n; i++)
|
||||||
|
+ {
|
||||||
|
+ charclass c;
|
||||||
|
+ memcpy (c, utf8_classes[i], sizeof c);
|
||||||
|
+ if (i == 1)
|
||||||
|
+ {
|
||||||
|
+ if (!(syntax_bits & RE_DOT_NEWLINE))
|
||||||
|
+ clrbit (eolbyte, c);
|
||||||
|
+ if (syntax_bits & RE_DOT_NOT_NULL)
|
||||||
|
+ clrbit ('\0', c);
|
||||||
|
+ }
|
||||||
|
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* A valid UTF-8 character is
|
||||||
|
+
|
||||||
|
+ ([0x00-0x7f]
|
||||||
|
+ |[0xc2-0xdf][0x80-0xbf]
|
||||||
|
+ |[0xe0-0xef][0x80-0xbf][0x80-0xbf]
|
||||||
|
+ |[0xf0-0xf7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
|
||||||
|
+
|
||||||
|
+ which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf]
|
||||||
|
+ and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
|
||||||
|
+ Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
|
||||||
|
+ for (i = 1; i < n; i++)
|
||||||
|
+ addtok (dfa->utf8_anychar_classes[i]);
|
||||||
|
+ while (--i > 1)
|
||||||
|
+ {
|
||||||
|
+ addtok (dfa->utf8_anychar_classes[0]);
|
||||||
|
+ addtok (CAT);
|
||||||
|
+ addtok (OR);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/* The grammar understood by the parser is as follows.
|
||||||
|
|
||||||
|
regexp:
|
||||||
|
@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc)
|
||||||
|
static void
|
||||||
|
atom (void)
|
||||||
|
{
|
||||||
|
+ if (0)
|
||||||
|
+ {
|
||||||
|
+ /* empty */
|
||||||
|
+ }
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
- if (tok == WCHAR)
|
||||||
|
+ else if (tok == WCHAR)
|
||||||
|
{
|
||||||
|
addtok_wc (case_fold ? towlower(wctok) : wctok);
|
||||||
|
#ifndef GREP
|
||||||
|
@@ -1242,16 +1295,28 @@ atom (void)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
tok = lex();
|
||||||
|
- return;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ else if (tok == ANYCHAR && using_utf8())
|
||||||
|
+ {
|
||||||
|
+ /* For UTF-8 expand the period to a series of CSETs that define a valid
|
||||||
|
+ UTF-8 character. This avoids using the slow multibyte path. I'm
|
||||||
|
+ pretty sure it would be both profitable and correct to do it for
|
||||||
|
+ any encoding; however, the optimization must be done manually as
|
||||||
|
+ it is done above in add_utf8_anychar. So, let's start with
|
||||||
|
+ UTF-8: it is the most used, and the structure of the encoding
|
||||||
|
+ makes the correctness more obvious. */
|
||||||
|
+ add_utf8_anychar();
|
||||||
|
+ tok = lex();
|
||||||
|
}
|
||||||
|
#endif /* MBS_SUPPORT */
|
||||||
|
|
||||||
|
- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|
||||||
|
- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
|
||||||
|
+ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|
||||||
|
+ || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
- || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
|
||||||
|
+ || tok == ANYCHAR || tok == MBCSET
|
||||||
|
#endif /* MBS_SUPPORT */
|
||||||
|
- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
|
||||||
|
+ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
|
||||||
|
{
|
||||||
|
addtok(tok);
|
||||||
|
tok = lex();
|
||||||
|
@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d)
|
||||||
|
for (i = 0; i < d->tindex; ++i)
|
||||||
|
{
|
||||||
|
switch(d->tokens[i])
|
||||||
|
- {
|
||||||
|
- case ANYCHAR:
|
||||||
|
- case MBCSET:
|
||||||
|
- /* Requires multi-byte algorithm. */
|
||||||
|
- return;
|
||||||
|
- default:
|
||||||
|
- break;
|
||||||
|
- }
|
||||||
|
+ {
|
||||||
|
+ case ANYCHAR:
|
||||||
|
+ /* Lowered. */
|
||||||
|
+ abort ();
|
||||||
|
+ case MBCSET:
|
||||||
|
+ /* Requires multi-byte algorithm. */
|
||||||
|
+ return;
|
||||||
|
+ default:
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
free_mbdata (d);
|
||||||
|
diff --git a/src/dfa.h b/src/dfa.h
|
||||||
|
index 1c85207..42c177a 100644
|
||||||
|
--- a/src/dfa.h
|
||||||
|
+++ b/src/dfa.h
|
||||||
|
@@ -283,6 +283,7 @@ struct dfa
|
||||||
|
with dfaparse(). */
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
|
||||||
|
+ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
|
||||||
|
|
||||||
|
/* The following are used only if MB_CUR_MAX > 1. */
|
||||||
|
|
||||||
|
--
|
||||||
|
1.6.6.1
|
||||||
|
|
114
grep-2.6.3-dfa-speedup-digit-xdigit.patch
Normal file
114
grep-2.6.3-dfa-speedup-digit-xdigit.patch
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||||||
|
Date: Tue, 4 May 2010 18:07:28 +0200
|
||||||
|
Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]]
|
||||||
|
|
||||||
|
There's no "multibyte pain" in these two classes, since POSIX
|
||||||
|
and ISO C99 mandate their contents.
|
||||||
|
|
||||||
|
Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words"
|
||||||
|
Before: 1.5s, after: 0.07s. (sed manages only 0.5s).
|
||||||
|
|
||||||
|
* src/dfa.c (predicates): Declare struct dfa_ctype separately
|
||||||
|
from definition. Add sb_only.
|
||||||
|
(find_pred): Return const struct dfa_ctype *.
|
||||||
|
(parse_bracket_exp): Return const struct dfa_ctype *. Do
|
||||||
|
not fill MBCSET for sb_only character types.
|
||||||
|
---
|
||||||
|
src/dfa.c | 55 ++++++++++++++++++++++++++++---------------------------
|
||||||
|
1 files changed, 28 insertions(+), 27 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/dfa.c b/src/dfa.c
|
||||||
|
index 4dd26c9..da5a306 100644
|
||||||
|
--- a/src/dfa.c
|
||||||
|
+++ b/src/dfa.c
|
||||||
|
@@ -429,26 +429,29 @@ typedef int predicate (int);
|
||||||
|
/* The following list maps the names of the Posix named character classes
|
||||||
|
to predicate functions that determine whether a given character is in
|
||||||
|
the class. The leading [ has already been eaten by the lexical analyzer. */
|
||||||
|
-static struct {
|
||||||
|
+struct dfa_ctype {
|
||||||
|
const char *name;
|
||||||
|
- predicate *pred;
|
||||||
|
-} const prednames[] = {
|
||||||
|
- { "alpha", is_alpha },
|
||||||
|
- { "upper", is_upper },
|
||||||
|
- { "lower", is_lower },
|
||||||
|
- { "digit", is_digit },
|
||||||
|
- { "xdigit", is_xdigit },
|
||||||
|
- { "space", is_space },
|
||||||
|
- { "punct", is_punct },
|
||||||
|
- { "alnum", is_alnum },
|
||||||
|
- { "print", is_print },
|
||||||
|
- { "graph", is_graph },
|
||||||
|
- { "cntrl", is_cntrl },
|
||||||
|
- { "blank", is_blank },
|
||||||
|
- { NULL, NULL }
|
||||||
|
+ predicate *func;
|
||||||
|
+ bool sb_only;
|
||||||
|
};
|
||||||
|
|
||||||
|
-static predicate *
|
||||||
|
+static const struct dfa_ctype prednames[] = {
|
||||||
|
+ { "alpha", isalpha, false },
|
||||||
|
+ { "upper", isupper, false },
|
||||||
|
+ { "lower", islower, false },
|
||||||
|
+ { "digit", isdigit, true },
|
||||||
|
+ { "xdigit", isxdigit, true },
|
||||||
|
+ { "space", isspace, false },
|
||||||
|
+ { "punct", ispunct, false },
|
||||||
|
+ { "alnum", isalnum, false },
|
||||||
|
+ { "print", isprint, false },
|
||||||
|
+ { "graph", isgraph, false },
|
||||||
|
+ { "cntrl", iscntrl, false },
|
||||||
|
+ { "blank", isblank, false },
|
||||||
|
+ { NULL, NULL, false }
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+static const struct dfa_ctype *
|
||||||
|
find_pred (const char *str)
|
||||||
|
{
|
||||||
|
unsigned int i;
|
||||||
|
@@ -456,7 +459,7 @@ find_pred (const char *str)
|
||||||
|
if (!strcmp(str, prednames[i].name))
|
||||||
|
break;
|
||||||
|
|
||||||
|
- return prednames[i].pred;
|
||||||
|
+ return &prednames[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Multibyte character handling sub-routine for lex.
|
||||||
|
@@ -553,8 +556,11 @@ parse_bracket_exp (void)
|
||||||
|
|| !strcmp (str, "lower"))
|
||||||
|
? "alpha"
|
||||||
|
: str);
|
||||||
|
+ const struct dfa_ctype *pred = find_pred (class);
|
||||||
|
+ if (!pred)
|
||||||
|
+ dfaerror(_("invalid character class"));
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
- if (MB_CUR_MAX > 1)
|
||||||
|
+ if (MB_CUR_MAX > 1 && !pred->sb_only)
|
||||||
|
{
|
||||||
|
/* Store the character class as wctype_t. */
|
||||||
|
wctype_t wt = wctype (class);
|
||||||
|
@@ -568,14 +574,9 @@ parse_bracket_exp (void)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
- {
|
||||||
|
- predicate *pred = find_pred (class);
|
||||||
|
- if (!pred)
|
||||||
|
- dfaerror(_("invalid character class"));
|
||||||
|
- for (c2 = 0; c2 < NOTCHAR; ++c2)
|
||||||
|
- if ((*pred)(c2))
|
||||||
|
- setbit_case_fold (c2, ccl);
|
||||||
|
- }
|
||||||
|
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
|
||||||
|
+ if (pred->func(c2))
|
||||||
|
+ setbit_case_fold (c2, ccl);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MBS_SUPPORT
|
||||||
|
--
|
||||||
|
1.6.6.1
|
||||||
|
|
38
grep-2.6.3-glibc-matcher-fallback.patch
Normal file
38
grep-2.6.3-glibc-matcher-fallback.patch
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||||||
|
Date: Thu, 29 Apr 2010 17:13:32 +0200
|
||||||
|
Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found
|
||||||
|
|
||||||
|
This patch works around the performance problems of multibyte grep
|
||||||
|
upstream.
|
||||||
|
|
||||||
|
For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z].
|
||||||
|
|
||||||
|
For other character sets all brackets and `.` as well will trigger it.
|
||||||
|
---
|
||||||
|
src/dfa.c | 9 +++++++++
|
||||||
|
1 files changed, 9 insertions(+), 0 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/dfa.c b/src/dfa.c
|
||||||
|
index e13c361..523fe05 100644
|
||||||
|
--- a/src/dfa.c
|
||||||
|
+++ b/src/dfa.c
|
||||||
|
@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
+ if (backref)
|
||||||
|
+ {
|
||||||
|
+ *backref = 1;
|
||||||
|
+ free(mblen_buf);
|
||||||
|
+ free(inputwcs);
|
||||||
|
+ *end = saved_end;
|
||||||
|
+ return (char *) p;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
/* Can match with a multibyte character (and multi character
|
||||||
|
collating element). Transition table might be updated. */
|
||||||
|
s = transit_state(d, s, &p);
|
||||||
|
--
|
||||||
|
1.6.6.1
|
||||||
|
|
26
grep-2.6.3-mmap-option-fix.patch
Normal file
26
grep-2.6.3-mmap-option-fix.patch
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||||||
|
Date: Tue, 20 Apr 2010 12:32:22 +0200
|
||||||
|
Subject: [PATCH] grep: fix --mmap not being ignored
|
||||||
|
|
||||||
|
* NEWS: Document bugfix.
|
||||||
|
* main.c (main): Ignore MMAP_OPTION.
|
||||||
|
---
|
||||||
|
src/main.c | 1 +
|
||||||
|
1 files changed, 1 insertions(+), 0 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/main.c b/src/main.c
|
||||||
|
index 1697c80..0ccf6eb 100644
|
||||||
|
--- a/src/main.c
|
||||||
|
+++ b/src/main.c
|
||||||
|
@@ -2061,6 +2061,7 @@ main (int argc, char **argv)
|
||||||
|
label = optarg;
|
||||||
|
break;
|
||||||
|
|
||||||
|
+ case MMAP_OPTION:
|
||||||
|
case 0:
|
||||||
|
/* long options */
|
||||||
|
break;
|
||||||
|
--
|
||||||
|
1.6.6.1
|
||||||
|
|
32
grep.spec
32
grep.spec
@ -3,7 +3,7 @@
|
|||||||
Summary: Pattern matching utilities
|
Summary: Pattern matching utilities
|
||||||
Name: grep
|
Name: grep
|
||||||
Version: 2.6.3
|
Version: 2.6.3
|
||||||
Release: 1%{?dist}
|
Release: 2%{?dist}
|
||||||
License: GPLv3+
|
License: GPLv3+
|
||||||
Group: Applications/Text
|
Group: Applications/Text
|
||||||
Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
|
Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz
|
||||||
@ -13,6 +13,21 @@ Requires(preun): /sbin/install-info
|
|||||||
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
|
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
|
||||||
BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext
|
BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext
|
||||||
BuildRequires: autoconf automake
|
BuildRequires: autoconf automake
|
||||||
|
# Speeds up DFA UTF-8 period patterns.
|
||||||
|
# original name: 0001-dfa-optimize-UTF-8-period.patch
|
||||||
|
Patch0: grep-2.6.3-dfa-optimize-period.patch
|
||||||
|
# Speeds up MBCSET ([a-z]) UTF-8 patterns.
|
||||||
|
# original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch
|
||||||
|
Patch1: grep-2.6.3-glibc-matcher-fallback.patch
|
||||||
|
# Deprecated --mmap option is now properly ignored.
|
||||||
|
# 0003-grep-fix-mmap-not-being-ignored.patch
|
||||||
|
Patch2: grep-2.6.3-mmap-option-fix.patch
|
||||||
|
# Speeds up -m and removes quadratic complexity when going to glibc.
|
||||||
|
# 0004-dfa-convert-to-wide-character-line-by-line.patch
|
||||||
|
Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch
|
||||||
|
# Speeds up DFA [[:digit:]] and [[:xdigit:]] patterns.
|
||||||
|
# 0005-dfa-speed-up-digit-and-xdigit.patch
|
||||||
|
Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch
|
||||||
|
|
||||||
%description
|
%description
|
||||||
The GNU versions of commonly used grep utilities. Grep searches through
|
The GNU versions of commonly used grep utilities. Grep searches through
|
||||||
@ -23,6 +38,11 @@ GNU grep is needed by many scripts, so it shall be installed on every system.
|
|||||||
|
|
||||||
%prep
|
%prep
|
||||||
%setup -q
|
%setup -q
|
||||||
|
%patch0 -p1 -b .dfa-optimize-period
|
||||||
|
%patch1 -p1 -b .glibc-matcher-fallback
|
||||||
|
%patch2 -p1 -b .mmap-option-fix
|
||||||
|
%patch3 -p1 -b .dfa-convert-to-wide-char
|
||||||
|
%patch4 -p1 -b .speedup-digit-xdigit
|
||||||
|
|
||||||
%build
|
%build
|
||||||
%configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre"
|
%configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre"
|
||||||
@ -59,6 +79,14 @@ fi
|
|||||||
%{_mandir}/*/*
|
%{_mandir}/*/*
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Thu May 06 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-2
|
||||||
|
- Added dfa-optimize-period patch (speedup for . patterns in UTF-8)
|
||||||
|
- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8)
|
||||||
|
- Added mmap-option-fix patch
|
||||||
|
- Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic
|
||||||
|
complexity when going to glibc)
|
||||||
|
- Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] and [[:xdigit:]])
|
||||||
|
|
||||||
* Sun Apr 04 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-1
|
* Sun Apr 04 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.6.3-1
|
||||||
- New version: grep-2.6.3
|
- New version: grep-2.6.3
|
||||||
- make check is not silent now
|
- make check is not silent now
|
||||||
@ -77,7 +105,7 @@ fi
|
|||||||
- Added w patch to fix -w switch behaviour broken by dfa-optional patch
|
- Added w patch to fix -w switch behaviour broken by dfa-optional patch
|
||||||
|
|
||||||
* Wed Feb 10 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.5.4-1
|
* Wed Feb 10 2010 Jaroslav Škarvada <jskarvad@redhat.com> - 2.5.4-1
|
||||||
- New version: grep-2.5.4 (#502931)
|
- New version: grep-2.5.4
|
||||||
- Fixed typos in %description
|
- Fixed typos in %description
|
||||||
- Updated utf-8 patch
|
- Updated utf-8 patch
|
||||||
- Added dfa-optional patch (#538423)
|
- Added dfa-optional patch (#538423)
|
||||||
|
Loading…
Reference in New Issue
Block a user