grep/grep-2.6.3-dfa-convert-to-wide-char.patch

From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Tue, 4 May 2010 17:26:09 +0200
Subject: [PATCH] dfa: convert to wide character line-by-line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This provides a nice speedup for -m in general, but especially
it avoids quadratic complexity in case we have to go to glibc.

Testcases:

   # From upstream backref-multibyte-slow
   yes aba | sed 10000q > aba.txt
   time ./egrep -c '^([a-z]).\1$' aba.txt

   # From rbiba
   time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt

* src/dfa.c (prepare_wc_buf): Extract out of dfaexec.  Convert
only up to the next newline.
(dfaexec): Exit multibyte processing loop if past buf_end.
Call prepare_wc_buf again after processing a newline.
---
 src/dfa.c |   96 +++++++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 523fe05..70aa5a8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)

 #endif /* MBS_SUPPORT */

+/* Fill mblen_buf and inputwcs for the bytes BEGIN..END inclusive, stopping at
+   the first eolbyte outside a multibyte character; set buf_begin, buf_end.  */
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+  unsigned char eol = eolbyte;
+  size_t remain_bytes, i;   /* remain_bytes: bytes left in current char */
+
+  buf_begin = (unsigned char *) begin;  /* index 0 of both arrays is BEGIN */
+
+  remain_bytes = 0;
+  for (i = 0; i < end - begin + 1; i++)   /* visits *END itself as well */
+    {
+      if (remain_bytes == 0)    /* at the start of a character */
+        {
+          remain_bytes
+            = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+          if (remain_bytes < 1  /* 0: mbrtowc decoded a null wide char */
+              || remain_bytes == (size_t) -1    /* invalid sequence */
+              || remain_bytes == (size_t) -2    /* incomplete sequence */
+              || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
+            {
+              remain_bytes = 0;         /* treat as a plain single byte */
+              inputwcs[i] = (wchar_t)begin[i];
+              mblen_buf[i] = 0;
+              if (begin[i] == eol)      /* end of line: convert no further */
+                break;
+            }
+          else
+            {
+              mblen_buf[i] = remain_bytes;  /* byte length of this char */
+              remain_bytes--;
+            }
+        }
+      else
+        {
+          mblen_buf[i] = remain_bytes;  /* bytes left, this one included */
+          inputwcs[i] = 0;
+          remain_bytes--;
+        }
+    }
+
+  buf_end = (unsigned char *) (begin + i);  /* the eol byte, or END + 1 */
+  mblen_buf[i] = 0;
+  inputwcs[i] = 0; /* sentinel */
+}
+
 /* Search through a buffer looking for a match to the given struct dfa.
    Find the first occurrence of a string matching the regexp in the
    buffer, and the shortest possible version thereof.  Return a pointer to
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 #ifdef MBS_SUPPORT
   if (d->mb_cur_max > 1)
     {
-      int remain_bytes, i;
-      buf_begin = (unsigned char *) begin;
-      buf_end = (unsigned char *) end;
-
-      /* initialize mblen_buf, and inputwcs.  */
       MALLOC(mblen_buf, unsigned char, end - begin + 2);
       MALLOC(inputwcs, wchar_t, end - begin + 2);
       memset(&mbs, 0, sizeof(mbstate_t));
-      remain_bytes = 0;
-      for (i = 0; i < end - begin + 1; i++)
-	{
-	  if (remain_bytes == 0)
-	    {
-	      remain_bytes
-		= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-	      if (remain_bytes < 1
-		|| (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
-		{
-		  remain_bytes = 0;
-		  inputwcs[i] = (wchar_t)begin[i];
-		  mblen_buf[i] = 0;
-		}
-	      else
-		{
-		  mblen_buf[i] = remain_bytes;
-		  remain_bytes--;
-		}
-	    }
-	  else
-	    {
-	      mblen_buf[i] = remain_bytes;
-	      inputwcs[i] = 0;
-	      remain_bytes--;
-	    }
-	}
-      mblen_buf[i] = 0;
-      inputwcs[i] = 0; /* sentinel */
+      prepare_wc_buf (p, end);
     }
 #endif /* MBS_SUPPORT */

@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       if (d->mb_cur_max > 1)
 	while ((t = trans[s]))
 	  {
-	    if ((char *) p > end)
+	    if (p > buf_end)
 	      break;
 	    s1 = s;
 	    SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
 	}

       /* If the previous character was a newline, count it. */
-      if (count && (char *) p <= end && p[-1] == eol)
-	++*count;
+      if ((char *) p <= end && p[-1] == eol)
+        {
+          if (count)
+            ++*count;
+
+#ifdef MBS_SUPPORT
+          if (d->mb_cur_max > 1)
+            prepare_wc_buf (p, end);
+#endif
+        }

       /* Check if we've run off the end of the buffer. */
       if ((char *) p > end)
--
1.6.6.1