From e1ca9413e82e97466724a435578522ff4f1b28c2 Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Wed, 3 Nov 2004 17:42:00 +0000 Subject: [PATCH] - Remove mb-caching hack. - Better multibyte handling in EGexecute(). - Don't need regex.c changes in grep-2.5-i18n.patch. --- grep-2.5-i18n.patch | 43 ----- grep-2.5.1-egf-speedup.patch | 348 +++++++++++++++++++++++++++++++++++ grep.spec | 11 +- 3 files changed, 356 insertions(+), 46 deletions(-) create mode 100644 grep-2.5.1-egf-speedup.patch diff --git a/grep-2.5-i18n.patch b/grep-2.5-i18n.patch index bda0cd9..8dc3dfe 100644 --- a/grep-2.5-i18n.patch +++ b/grep-2.5-i18n.patch @@ -1,46 +1,3 @@ ---- grep-2.5.1/lib/regex.c.i18n 2001-04-02 19:04:45.000000000 +0100 -+++ grep-2.5.1/lib/regex.c 2004-02-26 13:09:54.000000000 +0000 -@@ -60,6 +60,10 @@ - #ifdef MBS_SUPPORT - # define CHAR_TYPE wchar_t - # define US_CHAR_TYPE wchar_t/* unsigned character type */ -+# define CHAR_T_SIGN (1 << (sizeof(CHAR_TYPE) * 8 - 1)) -+# if defined _AIX -+# define WCHAR_T_NEED_SIGNEXTEND 1 -+# endif /* _AIX */ - # define COMPILED_BUFFER_VAR wc_buffer - # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ - # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1) -@@ -618,10 +622,13 @@ - /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ - - #ifdef MBS_SUPPORT --# define EXTRACT_NUMBER(destination, source) \ -- do { \ -- (destination) = *(source); \ -- } while (0) -+# ifdef WCHAR_T_NEED_SIGNEXTEND -+# define EXTRACT_NUMBER(destination, source) \ -+ (destination) = (*(source) ^ CHAR_T_SIGN) - CHAR_T_SIGN; -+# else -+# define EXTRACT_NUMBER(destination, source) \ -+ (destination) = *(source) -+# endif /* WCHAR_T_NEED_SIGNEXTEND */ - #else - # define EXTRACT_NUMBER(destination, source) \ - do { \ -@@ -638,7 +645,11 @@ - US_CHAR_TYPE *source; - { - #ifdef MBS_SUPPORT -+# ifdef WCHAR_T_NEED_SIGNEXTEND -+ *dest = (*source ^ CHAR_T_SIGN) - CHAR_T_SIGN; -+# else - *dest = *source; -+# endif /* WCHAR_T_NEED_SIGNEXTEND */ - #else - int temp = SIGN_EXTEND_CHAR (*(source + 1)); - *dest = *source & 0377; --- grep-2.5.1/src/dfa.c 2004-02-26 13:09:54.000000000 +0000 +++ grep-2.5.1/src/dfa.c 2004-05-18 16:43:31.189200479 +0100 @@ -414,7 +414,7 @@ diff --git a/grep-2.5.1-egf-speedup.patch b/grep-2.5.1-egf-speedup.patch new file mode 100644 index 0000000..0578a8c --- /dev/null +++ b/grep-2.5.1-egf-speedup.patch @@ -0,0 +1,348 @@ +--- grep-2.5.1/src/search.c.egf-speedup 2004-11-03 17:38:36.338557746 +0000 ++++ grep-2.5.1/src/search.c 2004-11-03 17:39:51.853925940 +0000 +@@ -70,9 +70,6 @@ + call the regexp matcher at all. */ + static int kwset_exact_matches; + +-#if defined(MBS_SUPPORT) +-static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); +-#endif + static void kwsinit PARAMS ((void)); + static void kwsmusts PARAMS ((void)); + static void Gcompile PARAMS ((char const *, size_t)); +@@ -141,47 +138,6 @@ + } + } + +-#ifdef MBS_SUPPORT +-/* This function allocate the array which correspond to "buf". +- Then this check multibyte string and mark on the positions which +- are not singlebyte character nor the first byte of a multibyte +- character. Caller must free the array. */ +-static char* +-check_multibyte_string(char const *buf, size_t size) +-{ +- char *mb_properties = xmalloc(size); +- mbstate_t cur_state; +- wchar_t wc; +- int i; +- memset(&cur_state, 0, sizeof(mbstate_t)); +- memset(mb_properties, 0, sizeof(char)*size); +- for (i = 0; i < size ;) +- { +- size_t mbclen; +- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); +- +- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) +- { +- /* An invalid sequence, or a truncated multibyte character. +- We treat it as a singlebyte character. */ +- mbclen = 1; +- } +- else if (match_icase) +- { +- if (iswupper((wint_t)wc)) +- { +- wc = towlower((wint_t)wc); +- wcrtomb(buf + i, wc, &cur_state); +- } +- } +- mb_properties[i] = mbclen; +- i += mbclen; +- } +- +- return mb_properties; +-} +-#endif +- + static void + Gcompile (char const *pattern, size_t size) + { +@@ -350,18 +306,9 @@ + struct kwsmatch kwsm; + size_t i, ret_val; + #ifdef MBS_SUPPORT +- char *mb_properties = NULL; +- if (MB_CUR_MAX > 1) +- { +- if (match_icase) +- { +- char *case_buf = xmalloc(size); +- memcpy(case_buf, buf, size); +- buf = case_buf; +- } +- if (kwset) +- mb_properties = check_multibyte_string(buf, size); +- } ++ size_t n; ++ mbstate_t mbs; ++ memset (&mbs, '\0', sizeof (mbstate_t)); + #endif /* MBS_SUPPORT */ + + buflim = buf + size; +@@ -376,15 +323,50 @@ + size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); + if (offset == (size_t) -1) + goto failure; ++#ifdef MBS_SUPPORT ++ n = offset; ++ while (n) ++ { ++ size_t len = mbrlen (beg, n, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ /* Incomplete character. */ ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ beg += n; ++ break; ++ } ++ ++ beg += len; ++ n -= len; ++ } ++ if (n) ++ continue; ++ ++ /* Narrow down to the line containing the candidate, and ++ run it through DFA. */ ++ end = beg; ++ while (end < buflim) ++ { ++ size_t len = mbrlen (end, buflim - end, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ continue; ++ } ++ if (len == 1 && *end == eol) ++ break; ++ ++ end += len; ++ } ++ end++; ++#else + beg += offset; + /* Narrow down to the line containing the candidate, and + run it through DFA. */ + end = memchr(beg, eol, buflim - beg); + end++; +-#ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) +- continue; +-#endif ++#endif /* MBS_SUPPORT */ ++ /* Hmm, is this correct for multibyte? */ + while (beg > buf && beg[-1] != eol) + --beg; + if (kwsm.index < kwset_exact_matches) +@@ -399,9 +381,44 @@ + if (offset == (size_t) -1) + break; + /* Narrow down to the line we've found. */ ++#ifdef MBS_SUPPORT ++ n = offset; ++ while (n) ++ { ++ size_t len = mbrlen (beg, n, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ /* Incomplete character. */ ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ beg += n; ++ continue; ++ } ++ ++ beg += len; ++ n -= len; ++ } ++ end = beg; ++ while (end < buflim) ++ { ++ size_t len = mbrlen (end, buflim - end, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ continue; ++ } ++ ++ if (len == 1 && *end == eol) ++ break; ++ ++ end += len; ++ } ++ end++; ++#else + beg += offset; + end = memchr (beg, eol, buflim - beg); + end++; ++#endif /* MBS_SUPPORT */ ++ /* Hmm, is this correct for multibyte? */ + while (beg > buf && beg[-1] != eol) + --beg; + } +@@ -469,15 +486,6 @@ + } /* for (beg = end ..) */ + + failure: +-#ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) +- { +- if (mb_properties) +- free (mb_properties); +- if (match_icase) +- free ((char *) buf); +- } +-#endif /* MBS_SUPPORT */ + return (size_t) -1; + + success_in_beg_and_end: +@@ -486,15 +494,6 @@ + /* FALLTHROUGH */ + + success_in_start_and_len: +-#ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) +- { +- if (mb_properties) +- free (mb_properties); +- if (match_icase) +- free ((char *) buf); +- } +-#endif /* MBS_SUPPORT */ + *match_size = len; + return start; + } +@@ -531,29 +530,37 @@ + struct kwsmatch kwsmatch; + size_t ret_val; + #ifdef MBS_SUPPORT +- char *mb_properties = NULL; +- if (MB_CUR_MAX > 1) +- { +- if (match_icase) +- { +- char *case_buf = xmalloc(size); +- memcpy(case_buf, buf, size); +- buf = case_buf; +- } +- mb_properties = check_multibyte_string(buf, size); +- } ++ mbstate_t mbs; ++ memset (&mbs, '\0', sizeof (mbstate_t)); + #endif /* MBS_SUPPORT */ + + for (beg = buf; beg <= buf + size; ++beg) + { ++#ifdef MBS_SUPPORT ++ size_t n; ++#endif /* MBS_SUPPORT */ + size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) + goto failure; + #ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) +- continue; /* It is a part of multibyte character. */ +-#endif /* MBS_SUPPORT */ ++ n = offset; ++ while (n) ++ { ++ size_t len = mbrlen (beg, n, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ /* Incomplete character. */ ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ beg += n; ++ continue; ++ } ++ ++ beg += len; ++ n -= len; ++ } ++#else + beg += offset; ++#endif /* MBS_SUPPORT */ + len = kwsmatch.size[0]; + if (exact && !match_words) + goto success_in_beg_and_len; +@@ -587,7 +594,25 @@ + if (offset == -1) { + break; /* Try a different anchor. */ + } ++#ifdef MBS_SUPPORT ++ n = offset; ++ while (n) ++ { ++ size_t len = mbrlen (beg, n, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ /* Incomplete character. */ ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ beg += n; ++ continue; ++ } ++ ++ beg += len; ++ n -= len; ++ } ++#else + beg += offset; ++#endif /* MBS_SUPPORT */ + len = kwsmatch.size[0]; + } + } +@@ -597,20 +622,30 @@ + } + + failure: +-#ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) +- { +- if (match_icase) +- free((char *) buf); +- if (mb_properties) +- free(mb_properties); +- } +-#endif /* MBS_SUPPORT */ + return -1; + + success: ++#ifdef MBS_SUPPORT ++ end = beg + len; ++ while (end < buf + size) ++ { ++ size_t len = mbrlen (end, buf + size - end, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ continue; ++ } ++ if (len == 1 && *end == eol) ++ break; ++ ++ end += len; ++ } ++ end++; ++#else + end = memchr (beg + len, eol, (buf + size) - (beg + len)); + end++; ++#endif /* MBS_SUPPORT */ ++ /* Hmm, is this correct for multibyte? */ + while (buf < beg && beg[-1] != eol) + --beg; + len = end - beg; +@@ -618,15 +653,6 @@ + + success_in_beg_and_len: + *match_size = len; +-#ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) +- { +- if (mb_properties) +- free (mb_properties); +- if (match_icase) +- free ((char *) buf); +- } +-#endif /* MBS_SUPPORT */ + return beg - buf; + } + diff --git a/grep.spec b/grep.spec index 8381343..1e3918a 100644 --- a/grep.spec +++ b/grep.spec @@ -1,7 +1,7 @@ Summary: The GNU versions of grep pattern matching utilities. Name: grep Version: 2.5.1 -Release: 31 +Release: 32 License: GPL Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.bz2 @@ -10,7 +10,7 @@ Patch1: grep-2.5.1-bracket.patch Patch2: grep-2.5-i18n.patch Patch3: grep-2.5.1-oi.patch Patch4: grep-2.5.1-manpage.patch -Patch5: grep-2.5.1-gofast.patch +Patch5: grep-2.5.1-egf-speedup.patch URL: http://www.gnu.org/software/grep/ Prereq: /sbin/install-info Buildroot: %{_tmppath}/%{name}-%{version}-root @@ -33,7 +33,7 @@ utility for searching through text. %patch2 -p1 -b .i18n %patch3 -p1 -b .oi %patch4 -p1 -b .manpage -%patch5 -p1 -b .gofast +%patch5 -p1 -b .egf-speedup %build [ ! -e configure ] && ./autogen.sh @@ -81,6 +81,11 @@ fi %{_mandir}/*/* %changelog +* Wed Nov 3 2004 Tim Waugh 2.5.1-32 +- Remove mb-caching hack. +- Better multibyte handling in EGexecute(). +- Don't need regex.c changes in grep-2.5-i18n.patch. + * Wed Oct 13 2004 Tim Waugh 2.5.1-31 - Make 'grep -F' avoid UTF-8 processing if the pattern contains no multibyte characters (bug #133932).