From 5ec5b7c5f8058589ad8b57648f8a22f8eddce142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaroslav=20=C5=A0karvada?= Date: Tue, 21 Sep 2010 15:04:45 +0200 Subject: [PATCH] - New version: grep-2.7 - Removed patches (already in upstream): dfa-optimize-period, glibc-matcher-fallback, mmap-option-fix, dfa-convert-to-wide-char, dfa-speedup-digit-xdigit --- .gitignore | 1 + grep-2.6.3-dfa-convert-to-wide-char.patch | 162 --------------------- grep-2.6.3-dfa-optimize-period.patch | 165 ---------------------- grep-2.6.3-dfa-speedup-digit-xdigit.patch | 114 --------------- grep-2.6.3-glibc-matcher-fallback.patch | 38 ----- grep-2.6.3-mmap-option-fix.patch | 26 ---- grep.spec | 30 ++-- sources | 2 +- 8 files changed, 10 insertions(+), 528 deletions(-) delete mode 100644 grep-2.6.3-dfa-convert-to-wide-char.patch delete mode 100644 grep-2.6.3-dfa-optimize-period.patch delete mode 100644 grep-2.6.3-dfa-speedup-digit-xdigit.patch delete mode 100644 grep-2.6.3-glibc-matcher-fallback.patch delete mode 100644 grep-2.6.3-mmap-option-fix.patch diff --git a/.gitignore b/.gitignore index cba6b1a..4eacbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ grep-2.6.3.tar.xz +/grep-2.7.tar.xz diff --git a/grep-2.6.3-dfa-convert-to-wide-char.patch b/grep-2.6.3-dfa-convert-to-wide-char.patch deleted file mode 100644 index 250b6c0..0000000 --- a/grep-2.6.3-dfa-convert-to-wide-char.patch +++ /dev/null @@ -1,162 +0,0 @@ -From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Tue, 4 May 2010 17:26:09 +0200 -Subject: [PATCH] dfa: convert to wide character line-by-line -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This provides a nice speedup for -m in general, but especially -it avoids quadratic complexity in case we have to go to glibc. - -Testcases: - - # From upstream backref-multibyte-slow - yes aba | sed 10000q > aba.txt - time ./egrep -c '^([a-z]).\1$' aba.txt - - # From rbiba - time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt - -* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert -only up to the next newline. -(dfaexec): Exit multibyte processing loop if past buf_end. -Call prepare_wc_buf again after processing a newline. ---- - src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++----------------------- - 1 files changed, 59 insertions(+), 37 deletions(-) - -diff --git a/src/dfa.c b/src/dfa.c -index 523fe05..70aa5a8 100644 ---- a/src/dfa.c -+++ b/src/dfa.c -@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) - - #endif /* MBS_SUPPORT */ - -+/* Initialize mblen_buf and inputwcs with data from the next line. */ -+ -+static void -+prepare_wc_buf (const char *begin, const char *end) -+{ -+ unsigned char eol = eolbyte; -+ size_t remain_bytes, i; -+ -+ buf_begin = (unsigned char *) begin; -+ -+ remain_bytes = 0; -+ for (i = 0; i < end - begin + 1; i++) -+ { -+ if (remain_bytes == 0) -+ { -+ remain_bytes -+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); -+ if (remain_bytes < 1 -+ || remain_bytes == (size_t) -1 -+ || remain_bytes == (size_t) -2 -+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) -+ { -+ remain_bytes = 0; -+ inputwcs[i] = (wchar_t)begin[i]; -+ mblen_buf[i] = 0; -+ if (begin[i] == eol) -+ break; -+ } -+ else -+ { -+ mblen_buf[i] = remain_bytes; -+ remain_bytes--; -+ } -+ } -+ else -+ { -+ mblen_buf[i] = remain_bytes; -+ inputwcs[i] = 0; -+ remain_bytes--; -+ } -+ } -+ -+ buf_end = (unsigned char *) (begin + i); -+ mblen_buf[i] = 0; -+ inputwcs[i] = 0; /* sentinel */ -+} -+ - /* Search through a buffer looking for a match to the given struct dfa. - Find the first occurrence of a string matching the regexp in the - buffer, and the shortest possible version thereof. Return a pointer to -@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end, - #ifdef MBS_SUPPORT - if (d->mb_cur_max > 1) - { -- int remain_bytes, i; -- buf_begin = (unsigned char *) begin; -- buf_end = (unsigned char *) end; -- -- /* initialize mblen_buf, and inputwcs. */ - MALLOC(mblen_buf, unsigned char, end - begin + 2); - MALLOC(inputwcs, wchar_t, end - begin + 2); - memset(&mbs, 0, sizeof(mbstate_t)); -- remain_bytes = 0; -- for (i = 0; i < end - begin + 1; i++) -- { -- if (remain_bytes == 0) -- { -- remain_bytes -- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); -- if (remain_bytes < 1 -- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) -- { -- remain_bytes = 0; -- inputwcs[i] = (wchar_t)begin[i]; -- mblen_buf[i] = 0; -- } -- else -- { -- mblen_buf[i] = remain_bytes; -- remain_bytes--; -- } -- } -- else -- { -- mblen_buf[i] = remain_bytes; -- inputwcs[i] = 0; -- remain_bytes--; -- } -- } -- mblen_buf[i] = 0; -- inputwcs[i] = 0; /* sentinel */ -+ prepare_wc_buf (p, end); - } - #endif /* MBS_SUPPORT */ - -@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, - if (d->mb_cur_max > 1) - while ((t = trans[s])) - { -- if ((char *) p > end) -+ if (p > buf_end) - break; - s1 = s; - SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); -@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end, - } - - /* If the previous character was a newline, count it. */ -- if (count && (char *) p <= end && p[-1] == eol) -- ++*count; -+ if ((char *) p <= end && p[-1] == eol) -+ { -+ if (count) -+ ++*count; -+ -+#ifdef MBS_SUPPORT -+ if (d->mb_cur_max > 1) -+ prepare_wc_buf (p, end); -+#endif -+ } - - /* Check if we've run off the end of the buffer. */ - if ((char *) p > end) --- -1.6.6.1 - diff --git a/grep-2.6.3-dfa-optimize-period.patch b/grep-2.6.3-dfa-optimize-period.patch deleted file mode 100644 index c83cc02..0000000 --- a/grep-2.6.3-dfa-optimize-period.patch +++ /dev/null @@ -1,165 +0,0 @@ -From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Mon, 19 Apr 2010 14:50:23 +0200 -Subject: [PATCH 1/2] dfa: optimize UTF-8 period - -Backport of upstream commits 7a0ad00 and 42ac56a. - -* src/dfa.h (struct dfa): Add utf8_anychar_classes. -* src/dfa.c (add_utf8_anychar): New. -(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR -in UTF-8 locales. -(dfaoptimize): Abort on ANYCHAR. ---- - src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- - src/dfa.h | 1 + - 2 files changed, 82 insertions(+), 14 deletions(-) - -diff --git a/src/dfa.c b/src/dfa.c -index ba78b08..e13c361 100644 ---- a/src/dfa.c -+++ b/src/dfa.c -@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc) - } - #endif - -+static void -+add_utf8_anychar (void) -+{ -+ static const charclass utf8_classes[5] = { -+ { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */ -+ { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */ -+ { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */ -+ { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */ -+ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */ -+ }; -+ const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]); -+ unsigned int i; -+ -+ /* Define the five character classes that are needed below. */ -+ if (dfa->utf8_anychar_classes[0] == 0) -+ for (i = 0; i < n; i++) -+ { -+ charclass c; -+ memcpy (c, utf8_classes[i], sizeof c); -+ if (i == 1) -+ { -+ if (!(syntax_bits & RE_DOT_NEWLINE)) -+ clrbit (eolbyte, c); -+ if (syntax_bits & RE_DOT_NOT_NULL) -+ clrbit ('\0', c); -+ } -+ dfa->utf8_anychar_classes[i] = CSET + charclass_index(c); -+ } -+ -+ /* A valid UTF-8 character is -+ -+ ([0x00-0x7f] -+ |[0xc2-0xdf][0x80-0xbf] -+ |[0xe0-0xef[0x80-0xbf][0x80-0xbf] -+ |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) -+ -+ which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf] -+ and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse -+ Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ -+ for (i = 1; i < n; i++) -+ addtok (dfa->utf8_anychar_classes[i]); -+ while (--i > 1) -+ { -+ addtok (dfa->utf8_anychar_classes[0]); -+ addtok (CAT); -+ addtok (OR); -+ } -+} -+ - /* The grammar understood by the parser is as follows. - - regexp: -@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc) - static void - atom (void) - { -+ if (0) -+ { -+ /* empty */ -+ } - #ifdef MBS_SUPPORT -- if (tok == WCHAR) -+ else if (tok == WCHAR) - { - addtok_wc (case_fold ? towlower(wctok) : wctok); - #ifndef GREP -@@ -1242,16 +1295,28 @@ atom (void) - #endif - - tok = lex(); -- return; -+ } -+ -+ else if (tok == ANYCHAR && using_utf8()) -+ { -+ /* For UTF-8 expand the period to a series of CSETs that define a valid -+ UTF-8 character. This avoids using the slow multibyte path. I'm -+ pretty sure it would be both profitable and correct to do it for -+ any encoding; however, the optimization must be done manually as -+ it is done above in add_utf8_anychar. So, let's start with -+ UTF-8: it is the most used, and the structure of the encoding -+ makes the correctness more obvious. */ -+ add_utf8_anychar(); -+ tok = lex(); - } - #endif /* MBS_SUPPORT */ - -- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF -- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -+ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF -+ || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD - #ifdef MBS_SUPPORT -- || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ -+ || tok == ANYCHAR || tok == MBCSET - #endif /* MBS_SUPPORT */ -- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) -+ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) - { - addtok(tok); - tok = lex(); -@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d) - for (i = 0; i < d->tindex; ++i) - { - switch(d->tokens[i]) -- { -- case ANYCHAR: -- case MBCSET: -- /* Requires multi-byte algorithm. */ -- return; -- default: -- break; -- } -+ { -+ case ANYCHAR: -+ /* Lowered. */ -+ abort (); -+ case MBCSET: -+ /* Requires multi-byte algorithm. */ -+ return; -+ default: -+ break; -+ } - } - - free_mbdata (d); -diff --git a/src/dfa.h b/src/dfa.h -index 1c85207..42c177a 100644 ---- a/src/dfa.h -+++ b/src/dfa.h -@@ -283,6 +283,7 @@ struct dfa - with dfaparse(). */ - #ifdef MBS_SUPPORT - unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */ -+ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ - - /* The following are used only if MB_CUR_MAX > 1. */ - --- -1.6.6.1 - diff --git a/grep-2.6.3-dfa-speedup-digit-xdigit.patch b/grep-2.6.3-dfa-speedup-digit-xdigit.patch deleted file mode 100644 index 984c3c7..0000000 --- a/grep-2.6.3-dfa-speedup-digit-xdigit.patch +++ /dev/null @@ -1,114 +0,0 @@ -From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Tue, 4 May 2010 18:07:28 +0200 -Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]] - -There's no "multibyte pain" in these two classes, since POSIX -and ISO C99 mandate their contents. - -Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words" -Before: 1.5s, after: 0.07s. (sed manages only 0.5s). - -* src/dfa.c (predicates): Declare struct dfa_ctype separately -from definition. Add sb_only. -(find_pred): Return const struct dfa_ctype *. -(parse_bracket_exp): Return const struct dfa_ctype *. Do -not fill MBCSET for sb_only character types. ---- - src/dfa.c | 55 ++++++++++++++++++++++++++++--------------------------- - 1 files changed, 28 insertions(+), 27 deletions(-) - -diff --git a/src/dfa.c b/src/dfa.c -index 4dd26c9..da5a306 100644 ---- a/src/dfa.c -+++ b/src/dfa.c -@@ -429,26 +429,29 @@ typedef int predicate (int); - /* The following list maps the names of the Posix named character classes - to predicate functions that determine whether a given character is in - the class. The leading [ has already been eaten by the lexical analyzer. */ --static struct { -+struct dfa_ctype { - const char *name; -- predicate *pred; --} const prednames[] = { -- { "alpha", is_alpha }, -- { "upper", is_upper }, -- { "lower", is_lower }, -- { "digit", is_digit }, -- { "xdigit", is_xdigit }, -- { "space", is_space }, -- { "punct", is_punct }, -- { "alnum", is_alnum }, -- { "print", is_print }, -- { "graph", is_graph }, -- { "cntrl", is_cntrl }, -- { "blank", is_blank }, -- { NULL, NULL } -+ predicate *func; -+ bool sb_only; - }; - --static predicate * -+static const struct dfa_ctype prednames[] = { -+ { "alpha", isalpha, false }, -+ { "upper", isupper, false }, -+ { "lower", islower, false }, -+ { "digit", isdigit, true }, -+ { "xdigit", isxdigit, true }, -+ { "space", isspace, false }, -+ { "punct", ispunct, false }, -+ { "alnum", isalnum, false }, -+ { "print", isprint, false }, -+ { "graph", isgraph, false }, -+ { "cntrl", iscntrl, false }, -+ { "blank", isblank, false }, -+ { NULL, NULL, false } -+}; -+ -+static const struct dfa_ctype * - find_pred (const char *str) - { - unsigned int i; -@@ -456,7 +459,7 @@ find_pred (const char *str) - if (!strcmp(str, prednames[i].name)) - break; - -- return prednames[i].pred; -+ return &prednames[i]; - } - - /* Multibyte character handling sub-routine for lex. -@@ -553,8 +556,11 @@ parse_bracket_exp (void) - || !strcmp (str, "lower")) - ? "alpha" - : str); -+ const struct dfa_ctype *pred = find_pred (class); -+ if (!pred) -+ dfaerror(_("invalid character class")); - #ifdef MBS_SUPPORT -- if (MB_CUR_MAX > 1) -+ if (MB_CUR_MAX > 1 && !pred->sb_only) - { - /* Store the character class as wctype_t. */ - wctype_t wt = wctype (class); -@@ -568,14 +574,9 @@ parse_bracket_exp (void) - } - #endif - -- { -- predicate *pred = find_pred (class); -- if (!pred) -- dfaerror(_("invalid character class")); -- for (c2 = 0; c2 < NOTCHAR; ++c2) -- if ((*pred)(c2)) -- setbit_case_fold (c2, ccl); -- } -+ for (c2 = 0; c2 < NOTCHAR; ++c2) -+ if (pred->func(c2)) -+ setbit_case_fold (c2, ccl); - } - - #ifdef MBS_SUPPORT --- -1.6.6.1 - diff --git a/grep-2.6.3-glibc-matcher-fallback.patch b/grep-2.6.3-glibc-matcher-fallback.patch deleted file mode 100644 index 609ee20..0000000 --- a/grep-2.6.3-glibc-matcher-fallback.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Thu, 29 Apr 2010 17:13:32 +0200 -Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found - -This patch works around the performance problems of multibyte grep -upstream. - -For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z]. - -For other character sets all brackets and `.` as well will trigger it. ---- - src/dfa.c | 9 +++++++++ - 1 files changed, 9 insertions(+), 0 deletions(-) - -diff --git a/src/dfa.c b/src/dfa.c -index e13c361..523fe05 100644 ---- a/src/dfa.c -+++ b/src/dfa.c -@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end, - continue; - } - -+ if (backref) -+ { -+ *backref = 1; -+ free(mblen_buf); -+ free(inputwcs); -+ *end = saved_end; -+ return (char *) p; -+ } -+ - /* Can match with a multibyte character (and multi character - collating element). Transition table might be updated. */ - s = transit_state(d, s, &p); --- -1.6.6.1 - diff --git a/grep-2.6.3-mmap-option-fix.patch b/grep-2.6.3-mmap-option-fix.patch deleted file mode 100644 index 6504dff..0000000 --- a/grep-2.6.3-mmap-option-fix.patch +++ /dev/null @@ -1,26 +0,0 @@ -From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001 -From: Paolo Bonzini -Date: Tue, 20 Apr 2010 12:32:22 +0200 -Subject: [PATCH] grep: fix --mmap not being ignored - -* NEWS: Document bugfix. -* main.c (main): Ignore MMAP_OPTION. ---- - src/main.c | 1 + - 1 files changed, 1 insertions(+), 0 deletions(-) - -diff --git a/src/main.c b/src/main.c -index 1697c80..0ccf6eb 100644 ---- a/src/main.c -+++ b/src/main.c -@@ -2061,6 +2061,7 @@ main (int argc, char **argv) - label = optarg; - break; - -+ case MMAP_OPTION: - case 0: - /* long options */ - break; --- -1.6.6.1 - diff --git a/grep.spec b/grep.spec index 02aebf2..9464f9f 100644 --- a/grep.spec +++ b/grep.spec @@ -2,8 +2,8 @@ Summary: Pattern matching utilities Name: grep -Version: 2.6.3 -Release: 4%{?dist} +Version: 2.7 +Release: 1%{?dist} License: GPLv3+ Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz @@ -16,21 +16,6 @@ Requires(preun): /sbin/install-info BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext BuildRequires: autoconf automake -# Speedups DFA UTF-8 period patterns. -# original name: 0001-dfa-optimize-UTF-8-period.patch -Patch0: grep-2.6.3-dfa-optimize-period.patch -# Speedups MBCSET ([a-z]) UTF-8 patterns. -# original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch -Patch1: grep-2.6.3-glibc-matcher-fallback.patch -# Deprecated --mmap option is now properly ignored. -# 0003-grep-fix-mmap-not-being-ignored.patch -Patch2: grep-2.6.3-mmap-option-fix.patch -# Speedups -m and removes quadratic complexity when going to glibc. -# 0004-dfa-convert-to-wide-character-line-by-line.patch -Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch -# Speedups DFA [[:digit:]] and [[:xdigit:]] patterns. -# 0005-dfa-speed-up-digit-and-xdigit.patch -Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch %description The GNU versions of commonly used grep utilities. Grep searches through @@ -41,11 +26,6 @@ GNU grep is needed by many scripts, so it shall be installed on every system. %prep %setup -q -%patch0 -p1 -b .dfa-optimize-period -%patch1 -p1 -b .glibc-matcher-fallback -%patch2 -p1 -b .mmap-option-fix -%patch3 -p1 -b .dfa-convert-to-wide-char -%patch4 -p1 -b .speedup-digit-xdigit %build %configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre" @@ -87,6 +67,12 @@ fi %{_mandir}/*/* %changelog +* Tue Sep 21 2010 Jaroslav Škarvada - 2.7-1 +- New version: grep-2.7 +- Removed patches (already in upstream): dfa-optimize-period, + glibc-matcher-fallback, mmap-option-fix, dfa-convert-to-wide-char, + dfa-speedup-digit-xdigit + * Fri Jun 11 2010 Jaroslav Škarvada - 2.6.3-4 - Colors can be globally disabled via /etc/GREP_COLORS (#602867) - Fixed indentation in spec diff --git a/sources b/sources index bcbaed1..d84d2d0 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -69a3bf508a3f14d12369e0e1c7a92763 grep-2.6.3.tar.xz +6dd9931a52501519d7779a27cf953326 grep-2.7.tar.xz