163 lines
4.5 KiB
Diff
163 lines
4.5 KiB
Diff
|
From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
|
||
|
From: Paolo Bonzini <bonzini@gnu.org>
|
||
|
Date: Tue, 4 May 2010 17:26:09 +0200
|
||
|
Subject: [PATCH] dfa: convert to wide character line-by-line
|
||
|
MIME-Version: 1.0
|
||
|
Content-Type: text/plain; charset=UTF-8
|
||
|
Content-Transfer-Encoding: 8bit
|
||
|
|
||
|
This provides a nice speedup for -m in general, but especially
|
||
|
it avoids quadratic complexity when we have to fall back to glibc's regex matcher.
|
||
|
|
||
|
Testcases:
|
||
|
|
||
|
# From upstream backref-multibyte-slow
|
||
|
yes aba | sed 10000q > aba.txt
|
||
|
time ./egrep -c '^([a-z]).\1$' aba.txt
|
||
|
|
||
|
# From rbiba
|
||
|
time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
|
||
|
|
||
|
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert
|
||
|
only up to the next newline.
|
||
|
(dfaexec): Exit multibyte processing loop if past buf_end.
|
||
|
Call prepare_wc_buf again after processing a newline.
|
||
|
---
|
||
|
src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++-----------------------
|
||
|
1 files changed, 59 insertions(+), 37 deletions(-)
|
||
|
|
||
|
diff --git a/src/dfa.c b/src/dfa.c
|
||
|
index 523fe05..70aa5a8 100644
|
||
|
--- a/src/dfa.c
|
||
|
+++ b/src/dfa.c
|
||
|
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
|
||
|
|
||
|
#endif /* MBS_SUPPORT */
|
||
|
|
||
|
+/* Initialize mblen_buf and inputwcs with data from the next line. */
|
||
|
+
|
||
|
+static void
|
||
|
+prepare_wc_buf (const char *begin, const char *end)
|
||
|
+{
|
||
|
+ unsigned char eol = eolbyte;
|
||
|
+ size_t remain_bytes, i;
|
||
|
+
|
||
|
+ buf_begin = (unsigned char *) begin;
|
||
|
+
|
||
|
+ remain_bytes = 0;
|
||
|
+ for (i = 0; i < end - begin + 1; i++)
|
||
|
+ {
|
||
|
+ if (remain_bytes == 0)
|
||
|
+ {
|
||
|
+ remain_bytes
|
||
|
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
|
||
|
+ if (remain_bytes < 1
|
||
|
+ || remain_bytes == (size_t) -1
|
||
|
+ || remain_bytes == (size_t) -2
|
||
|
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
|
||
|
+ {
|
||
|
+ remain_bytes = 0;
|
||
|
+ inputwcs[i] = (wchar_t)begin[i];
|
||
|
+ mblen_buf[i] = 0;
|
||
|
+ if (begin[i] == eol)
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ else
|
||
|
+ {
|
||
|
+ mblen_buf[i] = remain_bytes;
|
||
|
+ remain_bytes--;
|
||
|
+ }
|
||
|
+ }
|
||
|
+ else
|
||
|
+ {
|
||
|
+ mblen_buf[i] = remain_bytes;
|
||
|
+ inputwcs[i] = 0;
|
||
|
+ remain_bytes--;
|
||
|
+ }
|
||
|
+ }
|
||
|
+
|
||
|
+ buf_end = (unsigned char *) (begin + i);
|
||
|
+ mblen_buf[i] = 0;
|
||
|
+ inputwcs[i] = 0; /* sentinel */
|
||
|
+}
|
||
|
+
|
||
|
/* Search through a buffer looking for a match to the given struct dfa.
|
||
|
Find the first occurrence of a string matching the regexp in the
|
||
|
buffer, and the shortest possible version thereof. Return a pointer to
|
||
|
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||
|
#ifdef MBS_SUPPORT
|
||
|
if (d->mb_cur_max > 1)
|
||
|
{
|
||
|
- int remain_bytes, i;
|
||
|
- buf_begin = (unsigned char *) begin;
|
||
|
- buf_end = (unsigned char *) end;
|
||
|
-
|
||
|
- /* initialize mblen_buf, and inputwcs. */
|
||
|
MALLOC(mblen_buf, unsigned char, end - begin + 2);
|
||
|
MALLOC(inputwcs, wchar_t, end - begin + 2);
|
||
|
memset(&mbs, 0, sizeof(mbstate_t));
|
||
|
- remain_bytes = 0;
|
||
|
- for (i = 0; i < end - begin + 1; i++)
|
||
|
- {
|
||
|
- if (remain_bytes == 0)
|
||
|
- {
|
||
|
- remain_bytes
|
||
|
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
|
||
|
- if (remain_bytes < 1
|
||
|
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
|
||
|
- {
|
||
|
- remain_bytes = 0;
|
||
|
- inputwcs[i] = (wchar_t)begin[i];
|
||
|
- mblen_buf[i] = 0;
|
||
|
- }
|
||
|
- else
|
||
|
- {
|
||
|
- mblen_buf[i] = remain_bytes;
|
||
|
- remain_bytes--;
|
||
|
- }
|
||
|
- }
|
||
|
- else
|
||
|
- {
|
||
|
- mblen_buf[i] = remain_bytes;
|
||
|
- inputwcs[i] = 0;
|
||
|
- remain_bytes--;
|
||
|
- }
|
||
|
- }
|
||
|
- mblen_buf[i] = 0;
|
||
|
- inputwcs[i] = 0; /* sentinel */
|
||
|
+ prepare_wc_buf (p, end);
|
||
|
}
|
||
|
#endif /* MBS_SUPPORT */
|
||
|
|
||
|
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||
|
if (d->mb_cur_max > 1)
|
||
|
while ((t = trans[s]))
|
||
|
{
|
||
|
- if ((char *) p > end)
|
||
|
+ if (p > buf_end)
|
||
|
break;
|
||
|
s1 = s;
|
||
|
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
|
||
|
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
|
||
|
}
|
||
|
|
||
|
/* If the previous character was a newline, count it. */
|
||
|
- if (count && (char *) p <= end && p[-1] == eol)
|
||
|
- ++*count;
|
||
|
+ if ((char *) p <= end && p[-1] == eol)
|
||
|
+ {
|
||
|
+ if (count)
|
||
|
+ ++*count;
|
||
|
+
|
||
|
+#ifdef MBS_SUPPORT
|
||
|
+ if (d->mb_cur_max > 1)
|
||
|
+ prepare_wc_buf (p, end);
|
||
|
+#endif
|
||
|
+ }
|
||
|
|
||
|
/* Check if we've run off the end of the buffer. */
|
||
|
if ((char *) p > end)
|
||
|
--
|
||
|
1.6.6.1
|
||
|
|