- Further UTF-8 processing avoided since a '\n' byte is always an
  end-of-line character in that encoding.
2004-12-14 16:21:41 +00:00 · 2004-12-14 16:21:41 +00:00 · c3fd6f2bbf
commit c3fd6f2bbf
parent 5b2463aa13
2 changed files with 102 additions and 44 deletions
--- a/grep-2.5.1-egf-speedup.patch
+++ b/grep-2.5.1-egf-speedup.patch
@@ -1,16 +1,46 @@
--- grep-2.5.1/src/search.c.egf-speedup	2004-11-05 12:50:25.934736684 +0000
-+++ grep-2.5.1/src/search.c	2004-11-05 13:52:33.819394140 +0000
-@@ -70,9 +70,6 @@
+--- grep-2.5.1/src/search.c	2004-12-14 15:08:58.985159277 +0000
+++ grep-2.5.1/src/search.c	2004-12-14 15:55:21.257729918 +0000
+@@ -39,6 +39,9 @@
+ #ifdef HAVE_LIBPCRE
+ # include <pcre.h>
+ #endif
+#ifdef HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+ 
+ #define NCHAR (UCHAR_MAX + 1)
+ 
+@@ -70,9 +73,10 @@
    call the regexp matcher at all. */
 static int kwset_exact_matches;
 
 -#if defined(MBS_SUPPORT)
 -static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
 -#endif
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding. */
+static int using_utf8;
+
 static void kwsinit PARAMS ((void));
 static void kwsmusts PARAMS ((void));
 static void Gcompile PARAMS ((char const *, size_t));
-@@ -141,47 +138,6 @@
+@@ -84,6 +88,15 @@
+ static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
+ 
+ void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+    using_utf8 = 1;
+#endif
+}
+
+void
+ dfaerror (char const *mesg)
+ {
+   error (2, 0, mesg);
+@@ -141,47 +154,6 @@
     }
 }
 
@@ -58,7 +88,23 @@
 static void
 Gcompile (char const *pattern, size_t size)
 {
-@@ -350,18 +306,8 @@
+@@ -190,6 +162,7 @@
+   size_t total = size;
+   char const *motif = pattern;
+ 
+  check_utf8 ();
+   re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
+   dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
+ 
+@@ -266,6 +239,7 @@
+   size_t total = size;
+   char const *motif = pattern;
+ 
+  check_utf8 ();
+   if (strcmp (matcher, "awk") == 0)
+     {
+       re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
+@@ -350,18 +324,8 @@
   struct kwsmatch kwsm;
   size_t i, ret_val;
 #ifdef MBS_SUPPORT
@@ -79,7 +125,7 @@
 #endif /* MBS_SUPPORT */
 
   buflim = buf + size;
-@@ -373,18 +319,48 @@
+@@ -373,18 +337,48 @@
 	  if (kwset)
 	    {
 	      /* Find a possible match using the KWset matcher. */
@@ -90,7 +136,7 @@
 	      if (offset == (size_t) -1)
 	        goto failure;
 +#ifdef MBS_SUPPORT
-+	      if (MB_CUR_MAX > 1)
+	      if (MB_CUR_MAX > 1 && !using_utf8)
 +		{
 +		  bytes_left = offset;
 +		  while (bytes_left)
@@ -130,7 +176,7 @@
 	      while (beg > buf && beg[-1] != eol)
 		--beg;
 	      if (kwsm.index < kwset_exact_matches)
-@@ -395,13 +371,47 @@
+@@ -395,13 +389,47 @@
 	  else
 	    {
 	      /* No good fixed strings; start with DFA. */
@@ -142,7 +188,7 @@
 		break;
 	      /* Narrow down to the line we've found. */
 +#ifdef MBS_SUPPORT
-+	      if (MB_CUR_MAX > 1)
+	      if (MB_CUR_MAX > 1 && !using_utf8)
 +		{
 +		  bytes_left = offset;
 +		  while (bytes_left)
@@ -178,7 +224,7 @@
 	      while (beg > buf && beg[-1] != eol)
 		--beg;
 	    }
-@@ -469,15 +479,6 @@
+@@ -469,15 +497,6 @@
     } /* for (beg = end ..) */
 
  failure:
@@ -194,7 +240,7 @@
   return (size_t) -1;
 
  success_in_beg_and_end:
-@@ -486,15 +487,6 @@
+@@ -486,15 +505,6 @@
   /* FALLTHROUGH */
 
  success_in_start_and_len:
@@ -210,7 +256,15 @@
   *match_size = len;
   return start;
 }
-@@ -531,17 +523,8 @@
+@@ -504,6 +514,7 @@
+ {
+   char const *beg, *lim, *err;
+ 
+  check_utf8 ();
+   kwsinit ();
+   beg = pattern;
+   do
+@@ -531,17 +542,8 @@
   struct kwsmatch kwsmatch;
   size_t ret_val;
 #ifdef MBS_SUPPORT
@@ -230,13 +284,13 @@
 #endif /* MBS_SUPPORT */
 
   for (beg = buf; beg <= buf + size; ++beg)
-@@ -550,8 +533,33 @@
+@@ -550,8 +552,33 @@
       if (offset == (size_t) -1)
 	goto failure;
 #ifdef MBS_SUPPORT
 -      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
 -	continue; /* It is a part of multibyte character.  */
-+      if (MB_CUR_MAX > 1)
+      if (MB_CUR_MAX > 1 && !using_utf8)
 +	{
 +	  size_t bytes_left = offset;
 +	  while (bytes_left)
@@ -266,12 +320,12 @@
 #endif /* MBS_SUPPORT */
       beg += offset;
       len = kwsmatch.size[0];
-@@ -587,6 +595,36 @@
+@@ -587,6 +614,36 @@
 	          if (offset == -1) {
 	            break; /* Try a different anchor. */
 	          }
 +#ifdef MBS_SUPPORT
-+		  if (MB_CUR_MAX > 1)
+		  if (MB_CUR_MAX > 1 && !using_utf8)
 +		    {
 +		      size_t bytes_left = offset;
 +		      while (bytes_left)
@@ -303,47 +357,47 @@
 	          beg += offset;
 	          len = kwsmatch.size[0];
 	        }
-@@ -597,20 +635,30 @@
+@@ -597,19 +654,31 @@
     }
 
  failure:
-#ifdef MBS_SUPPORT
+  return -1;
+
+ success:
+ #ifdef MBS_SUPPORT
 -  if (MB_CUR_MAX > 1)
-    {
+  if (MB_CUR_MAX > 1 && !using_utf8)
+     {
 -      if (match_icase)
 -        free((char *) buf);
 -      if (mb_properties)
 -        free(mb_properties);
-    }
-#endif /* MBS_SUPPORT */
-   return -1;
- 
-  success:
-+#ifdef MBS_SUPPORT
-+  end = beg + len;
-+  while (end < buf + size)
-+    {
-+      size_t len = mbrlen (end, buf + size - end, &mbs);
-+      if (len == (size_t) -1 || len == (size_t) -2 || len == 0)
+      end = beg + len;
+      while (end < buf + size)
 +	{
-+	  memset (&mbs, '\0', sizeof (mbstate_t));
-+	  len = 1;
-+	}
-+      if (len == 1 && *end == eol)
-+	break;
+	  size_t len = mbrlen (end, buf + size - end, &mbs);
+	  if (len == (size_t) -1 || len == (size_t) -2 || len == 0)
+	    {
+	      memset (&mbs, '\0', sizeof (mbstate_t));
+	      len = 1;
+	    }
+	  if (len == 1 && *end == eol)
+	    break;
 +
-+      end += len;
-+    }
-+  end++;
-+#else
+	  end += len;
+	}
+     }
+  else
+ #endif /* MBS_SUPPORT */
+-  return -1;
+-
+- success:
   end = memchr (beg + len, eol, (buf + size) - (beg + len));
+
   end++;
-+#endif /* MBS_SUPPORT */
-+  /* Hmm, is this correct for multibyte? */
   while (buf < beg && beg[-1] != eol)
     --beg;
-   len = end - beg;
-@@ -618,15 +666,6 @@
+@@ -618,15 +687,6 @@
 
  success_in_beg_and_len:
   *match_size = len;
--- a/grep.spec
+++ b/grep.spec
@@ -1,7 +1,7 @@
 Summary: The GNU versions of grep pattern matching utilities.
 Name: grep
 Version: 2.5.1
-Release: 41
+Release: 42
 License: GPL
 Group: Applications/Text
 Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.bz2
@@ -85,6 +85,10 @@ fi
 %{_mandir}/*/*

 %changelog
+* Tue Dec 14 2004 Tim Waugh <twaugh@redhat.com> 2.5.1-42
+- Further UTF-8 processing avoided since a '\n' byte is always an
+  end-of-line character in that encoding.
+
 * Fri Dec  3 2004 Tim Waugh <twaugh@redhat.com> 2.5.1-41
 - Fixed a busy loop in the egf-speedup patch (bug #140781).