grep/grep-2.6.3-dfa-convert-to-wide-char.patch
Jaroslav Škarvada 6ccb1073e9 - Added dfa-optimize-period patch (speedup for . patterns in UTF-8)
- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8)
- Added mmap-option-fix patch
- Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic
    complexity when going to glibc)
- Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]]
    [[:xdigit:]])
2010-05-07 11:34:53 +00:00

163 lines
4.5 KiB
Diff

From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <bonzini@gnu.org>
Date: Tue, 4 May 2010 17:26:09 +0200
Subject: [PATCH] dfa: convert to wide character line-by-line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This provides a nice speedup for -m in general, but especially
it avoids quadratic complexity in case we have to go to glibc.
Testcases:
# From upstream backref-multibyte-slow
yes aba | sed 10000q > aba.txt
time ./egrep -c '^([a-z]).\1$' aba.txt
# From rbiba
time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert
only up to the next newline.
(dfaexec): Exit multibyte processing loop if past buf_end.
Call prepare_wc_buf again after processing a newline.
---
src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 59 insertions(+), 37 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 523fe05..70aa5a8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp)
#endif /* MBS_SUPPORT */
+/* Initialize mblen_buf and inputwcs with data from the next line: decode bytes starting at BEGIN, stopping at the first eolbyte that decodes as a single byte (or after the byte at END if there is none), and set buf_begin/buf_end to delimit the converted span. */
+
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+ unsigned char eol = eolbyte; /* line terminator byte that ends the conversion */
+ size_t remain_bytes, i;
+
+ buf_begin = (unsigned char *) begin;
+
+ remain_bytes = 0; /* continuation bytes still owed by the current multibyte character */
+ for (i = 0; i < end - begin + 1; i++) /* visits every byte from begin through end inclusive */
+ {
+ if (remain_bytes == 0) /* at a character boundary: decode a new character */
+ {
+ remain_bytes
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); /* mbs carries the conversion state across calls */
+ if (remain_bytes < 1 /* size_t is unsigned, so this is the == 0 case (null wide character) */
+ || remain_bytes == (size_t) -1 /* mbrtowc reported an invalid sequence */
+ || remain_bytes == (size_t) -2 /* mbrtowc reported an incomplete sequence */
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) /* one byte that converts to itself */
+ {
+ remain_bytes = 0; /* all of the cases above are handled as a plain single byte */
+ inputwcs[i] = (wchar_t)begin[i];
+ mblen_buf[i] = 0; /* 0 marks a single-byte position */
+ if (begin[i] == eol)
+ break; /* end of this line; i is left indexing the eol byte */
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes; /* lead byte records the full length of the character */
+ remain_bytes--;
+ }
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes; /* trailing byte: bytes left in this character, counting itself */
+ inputwcs[i] = 0; /* only the lead byte's slot holds the wide character */
+ remain_bytes--;
+ }
+ }
+
+ buf_end = (unsigned char *) (begin + i); /* the eol byte after a break, otherwise one past the last byte examined */
+ mblen_buf[i] = 0;
+ inputwcs[i] = 0; /* sentinel; after a break this replaces the entry written for the eol byte */
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
#ifdef MBS_SUPPORT
if (d->mb_cur_max > 1)
{
- int remain_bytes, i;
- buf_begin = (unsigned char *) begin;
- buf_end = (unsigned char *) end;
-
- /* initialize mblen_buf, and inputwcs. */
MALLOC(mblen_buf, unsigned char, end - begin + 2);
MALLOC(inputwcs, wchar_t, end - begin + 2);
memset(&mbs, 0, sizeof(mbstate_t));
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
- {
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t)begin[i];
- mblen_buf[i] = 0;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- inputwcs[i] = 0;
- remain_bytes--;
- }
- }
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
+ prepare_wc_buf (p, end);
}
#endif /* MBS_SUPPORT */
@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
while ((t = trans[s]))
{
- if ((char *) p > end)
+ if (p > buf_end)
break;
s1 = s;
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
/* If the previous character was a newline, count it. */
- if (count && (char *) p <= end && p[-1] == eol)
- ++*count;
+ if ((char *) p <= end && p[-1] == eol)
+ {
+ if (count)
+ ++*count;
+
+#ifdef MBS_SUPPORT
+ if (d->mb_cur_max > 1)
+ prepare_wc_buf (p, end);
+#endif
+ }
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
--
1.6.6.1