From e7e16314f30e0c272c3c90d19ffda21b6793b323 Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Tue, 21 Dec 2004 13:57:42 +0000 Subject: [PATCH] - Fixed -Fi for multibyte input (bug #143079). --- grep-2.5.1-egf-speedup.patch | 264 ++++++++++++++++++++++++++++++++--- grep.spec | 5 +- 2 files changed, 249 insertions(+), 20 deletions(-) diff --git a/grep-2.5.1-egf-speedup.patch b/grep-2.5.1-egf-speedup.patch index 6d777de..88fdeea 100644 --- a/grep-2.5.1-egf-speedup.patch +++ b/grep-2.5.1-egf-speedup.patch @@ -1,6 +1,14 @@ ---- grep-2.5.1/src/search.c 2004-12-16 17:46:57.039678304 +0000 -+++ grep-2.5.1/src/search.c 2004-12-17 13:03:49.300731757 +0000 -@@ -39,6 +39,9 @@ +--- grep-2.5.1/src/search.c 2004-12-21 13:37:15.700555594 +0000 ++++ grep-2.5.1/src/search.c 2004-12-21 13:49:05.873811016 +0000 +@@ -21,6 +21,7 @@ + #ifdef HAVE_CONFIG_H + # include + #endif ++#include + #include + #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC + /* We can handle multibyte string. */ +@@ -39,6 +40,9 @@ #ifdef HAVE_LIBPCRE # include #endif @@ -10,7 +18,7 @@ #define NCHAR (UCHAR_MAX + 1) -@@ -70,9 +73,10 @@ +@@ -70,9 +74,10 @@ call the regexp matcher at all. */ static int kwset_exact_matches; @@ -24,7 +32,7 @@ static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); -@@ -84,6 +88,15 @@ +@@ -84,6 +89,15 @@ static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void @@ -40,7 +48,7 @@ dfaerror (char const *mesg) { error (2, 0, mesg); -@@ -141,47 +154,6 @@ +@@ -141,47 +155,6 @@ } } @@ -88,7 +96,7 @@ static void Gcompile (char const *pattern, size_t size) { -@@ -190,6 +162,7 @@ +@@ -190,6 +163,7 @@ size_t total = size; char const *motif = pattern; @@ -96,7 +104,7 @@ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); -@@ -266,6 +239,7 @@ +@@ -266,6 +240,7 @@ size_t total = size; char const *motif = pattern; @@ -104,7 +112,7 @@ if (strcmp (matcher, "awk") == 0) { re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); -@@ -350,18 +324,8 @@ +@@ -350,18 +325,8 @@ struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT @@ -125,7 +133,7 @@ #endif /* MBS_SUPPORT */ buflim = buf + size; -@@ -373,21 +337,63 @@ +@@ -373,21 +338,63 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ @@ -193,7 +201,7 @@ goto success_in_beg_and_end; if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) continue; -@@ -395,13 +401,47 @@ +@@ -395,13 +402,47 @@ else { /* No good fixed strings; start with DFA. */ @@ -241,7 +249,7 @@ while (beg > buf && beg[-1] != eol) --beg; } -@@ -469,15 +509,6 @@ +@@ -469,15 +510,6 @@ } /* for (beg = end ..) */ failure: @@ -257,7 +265,7 @@ return (size_t) -1; success_in_beg_and_end: -@@ -486,15 +517,6 @@ +@@ -486,24 +518,125 @@ /* FALLTHROUGH */ success_in_start_and_len: @@ -273,15 +281,214 @@ *match_size = len; return start; } -@@ -504,6 +526,7 @@ + ++static wchar_t **f_pattern; ++static char *f_initial_byte; ++static size_t f_pattern_count; ++static int f_i_multibyte; /* whether we're using the new -Fi MB method */ ++ + static void + Fcompile (char const *pattern, size_t size) { char const *beg, *lim, *err; + check_utf8 (); ++#ifdef MBS_SUPPORT ++ /* Support -F -i for UTF-8 input. */ ++ if (match_icase && MB_CUR_MAX > 1) ++ { ++ size_t in = 0; ++ ++ while (f_i_multibyte != -1 && in < size) ++ { ++ wchar_t *f_this_pattern; ++ size_t f_this_pattern_allocated = sizeof (wchar_t) * 1000; ++ mbstate_t mbs; ++ size_t out = 0; ++ f_pattern_count++; ++ f_pattern = xrealloc (f_pattern, ++ sizeof (wchar_t *) * f_pattern_count); ++ f_initial_byte = xrealloc (f_initial_byte, ++ sizeof (char) * ++ (2 * f_pattern_count + 1)); ++ if (f_pattern_count == 1) ++ f_initial_byte[0] = '\0'; ++ ++ /* Convert pattern into wchar_t*, storing them in this_pattern. ++ Don't read more than we're given. */ ++ f_this_pattern = xmalloc (f_this_pattern_allocated); ++ memset (&mbs, '\0', sizeof (mbs)); ++ while (in < size) ++ { ++ size_t c; ++ wchar_t this_wc; ++ if (out == f_this_pattern_allocated) ++ { ++ f_this_pattern_allocated *= 2; ++ f_this_pattern = xrealloc (f_this_pattern, ++ f_this_pattern_allocated); ++ } ++ ++ c = mbrtowc (&this_wc, pattern + in, size - in, &mbs); ++ if (c < 1) ++ { ++ /* Fall back to old method. */ ++ f_i_multibyte = -1; ++ while (f_pattern_count--) ++ free (f_pattern[f_pattern_count]); ++ free (f_pattern); ++ f_pattern = NULL; ++ break; ++ } ++ ++ f_this_pattern[out] = towlower (this_wc); ++ if (out == 0) ++ { ++ /* First character. Work out the first byte of upper and ++ lower case multibyte strings for the first character. */ ++ wchar_t wc; ++ char mbs[MB_CUR_MAX]; ++ mbstate_t ps; ++ ++ if (iswupper (this_wc)) ++ { ++ wc = towlower (this_wc); ++ } ++ else ++ { ++ wc = towupper (this_wc); ++ } ++ ++ memset (&ps, '\0', sizeof (ps)); ++ wcrtomb (mbs, this_wc, &ps); ++ mbs[1] = '\0'; ++ strcat (f_initial_byte, mbs); ++ ++ memset (&ps, '\0', sizeof (ps)); ++ wcrtomb (mbs, wc, &ps); ++ mbs[1] = '\0'; ++ strcat (f_initial_byte, mbs); ++ } ++ ++ in += c; ++ ++ if (this_wc == L'\n') ++ break; ++ ++ out++; ++ } ++ ++ if (f_i_multibyte == -1) ++ break; ++ ++ /* Nul-terminate it. */ ++ if (out == f_this_pattern_allocated) ++ { ++ f_this_pattern_allocated++; ++ f_this_pattern = xrealloc (f_this_pattern, ++ f_this_pattern_allocated); ++ } ++ ++ f_this_pattern[out] = L'\0'; ++ f_pattern[f_pattern_count - 1] = f_this_pattern; ++ f_i_multibyte = 1; ++ } ++ } ++#endif /* MBS_SUPPORT */ ++ ++ kwsinit (); beg = pattern; do -@@ -531,17 +554,8 @@ +@@ -523,6 +656,87 @@ + } + + static size_t ++Fimbexec (const char *buf, size_t size, size_t *plen) ++{ ++ char const *beg; ++ size_t len; ++ mbstate_t mbs; ++ ++ assert (match_icase && f_i_multibyte == 1); ++ assert (MB_CUR_MAX > 1); ++ ++ memset (&mbs, '\0', sizeof (mbs)); ++ beg = buf; ++ len = 0; ++ while (beg < buf + size) ++ { ++ wchar_t wc; ++ char const *p; ++ char const *next_char; ++ unsigned char match[f_pattern_count]; ++ size_t i, letter; ++ int patterns_left; ++ ++ for (p = beg; ++ (p < buf + size) && !strchr (f_initial_byte, *p); ++ p++) ++ ; ++ ++ if (p == NULL || p == buf + size) ++ break; ++ ++ /* First byte matches, now check the rest */ ++ beg = p; ++ letter = len = 0; ++ memset (match, '\1', f_pattern_count); ++ patterns_left = 1; ++ while (patterns_left) ++ { ++ size_t c; ++ ++ patterns_left = 0; ++ ++ c = mbrtowc (&wc, beg + len, size - (beg - buf) - len, &mbs); ++ if (c < 1) ++ { ++ memset (&mbs, '\0', sizeof (mbs)); ++ next_char = beg + 1; ++ break; ++ } ++ ++ if (!len) ++ next_char = beg + c; ++ ++ wc = towlower (wc); ++ for (i = 0; i < f_pattern_count; i++) ++ { ++ if (match[i]) ++ { ++ if (f_pattern[i][letter] == L'\0') ++ { ++ /* Found a match. */ ++ *plen = len; ++ return beg - buf; ++ } ++ ++ if (f_pattern[i][letter] == wc) ++ patterns_left = 1; ++ else ++ match[i] = '\0'; ++ } ++ } ++ ++ len += c; ++ letter++; ++ } ++ ++ beg = next_char; ++ } ++ ++ return -1; ++} ++ ++static size_t + Fexecute (char const *buf, size_t size, size_t *match_size, int exact) + { + register char const *beg, *try, *end; +@@ -531,27 +745,50 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT @@ -301,7 +508,16 @@ #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) -@@ -550,8 +564,33 @@ + { +- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); ++ size_t offset; ++#ifdef MBS_SUPPORT ++ if (match_icase && f_i_multibyte == 1) ++ offset = Fimbexec (beg, buf + size - beg, &kwsmatch.size[0]); ++ else ++#endif /* MBS_SUPPORT */ ++ offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); ++ if (offset == (size_t) -1) goto failure; #ifdef MBS_SUPPORT @@ -337,7 +553,17 @@ #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; -@@ -587,6 +626,36 @@ +@@ -583,10 +820,46 @@ + { + /* Try a shorter length anchored at the same place. */ + --len; ++#ifdef MBS_SUPPORT ++ if (match_icase && f_i_multibyte == 1) ++ offset = Fimbexec (beg, len, &kwsmatch.size[0]); ++ else ++#endif /* MBS_SUPPORT */ + offset = kwsexec (kwset, beg, len, &kwsmatch); ++ if (offset == -1) { break; /* Try a different anchor. */ } @@ -374,7 +600,7 @@ beg += offset; len = kwsmatch.size[0]; } -@@ -597,19 +666,31 @@ +@@ -597,19 +870,31 @@ } failure: @@ -414,7 +640,7 @@ end++; while (buf < beg && beg[-1] != eol) --beg; -@@ -618,15 +699,6 @@ +@@ -618,15 +903,6 @@ success_in_beg_and_len: *match_size = len; diff --git a/grep.spec b/grep.spec index efb51e3..d678e18 100644 --- a/grep.spec +++ b/grep.spec @@ -1,7 +1,7 @@ Summary: The GNU versions of grep pattern matching utilities. Name: grep Version: 2.5.1 -Release: 43 +Release: 44 License: GPL Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.bz2 @@ -85,6 +85,9 @@ fi %{_mandir}/*/* %changelog +* Tue Dec 21 2004 Tim Waugh 2.5.1-44 +- Fixed -Fi for multibyte input (bug #143079). + * Thu Dec 16 2004 Tim Waugh 2.5.1-43 - Bypass kwset matching when ignoring case and processing multibyte input (bug #143079).