diff --git a/src/pcresearch.c b/src/pcresearch.c index 820dd00..11df488 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -136,34 +136,42 @@ Pexecute (char const *buf, size_t size, size_t *match_size, #else /* This array must have at least two elements; everything after that is just for performance improvement in pcre_exec. */ - int sub[300]; + enum { nsub = 300 }; + int sub[nsub]; - const char *line_buf, *line_end, *line_next; + char const *p = start_ptr ? start_ptr : buf; + int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL; + char const *line_start = buf; int e = PCRE_ERROR_NOMATCH; - ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; + char const *line_end; /* PCRE can't limit the matching to single lines, therefore we have to match each line in the buffer separately. */ - for (line_next = buf; - e == PCRE_ERROR_NOMATCH && line_next < buf + size; - start_ofs -= line_next - line_buf) + for (; p < buf + size; p = line_start = line_end + 1) { - line_buf = line_next; - line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); - if (line_end == NULL) - line_next = line_end = buf + size; - else - line_next = line_end + 1; - - if (start_ptr && start_ptr >= line_end) - continue; + line_end = memchr (p, eolbyte, buf + size - p); - if (INT_MAX < line_end - line_buf) + if (INT_MAX < line_end - p) error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); - e = pcre_exec (cre, extra, line_buf, line_end - line_buf, - start_ofs < 0 ? 0 : start_ofs, 0, - sub, sizeof sub / sizeof *sub); + /* Treat encoding-error bytes as data that cannot match. */ + for (;;) + { + e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub); + if (e != PCRE_ERROR_BADUTF8) + break; + e = pcre_exec (cre, extra, p, sub[0], 0, + options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, + sub, nsub); + if (e != PCRE_ERROR_NOMATCH) + break; + p += sub[0] + 1; + options = PCRE_NOTBOL; + } + + if (e != PCRE_ERROR_NOMATCH) + break; + options = 0; } if (e <= 0) @@ -180,10 +188,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, error (EXIT_TROUBLE, 0, _("exceeded PCRE's backtracking limit")); - case PCRE_ERROR_BADUTF8: - error (EXIT_TROUBLE, 0, - _("invalid UTF-8 byte sequence in input")); - default: /* For now, we lump all remaining PCRE failures into this basket. If anyone cares to provide sample grep usage that can trigger @@ -197,25 +201,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size, } else { - /* Narrow down to the line we've found. */ - char const *beg = line_buf + sub[0]; - char const *end = line_buf + sub[1]; - char const *buflim = buf + size; - char eol = eolbyte; - if (!start_ptr) - { - /* FIXME: The case when '\n' is not found indicates a bug: - Since grep is line oriented, the match should never contain - a newline, so there _must_ be a newline following. - */ - if (!(end = memchr (end, eol, buflim - end))) - end = buflim; - else - end++; - while (buf < beg && beg[-1] != eol) - --beg; - } - + char const *beg = start_ptr ? p + sub[0] : line_start; + char const *end = start_ptr ? p + sub[1] : line_end + 1; *match_size = end - beg; return beg - buf; } diff --git a/tests/pcre-infloop b/tests/pcre-infloop index 1b33e72..b92f8e1 100755 --- a/tests/pcre-infloop +++ b/tests/pcre-infloop @@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ fail=0 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in -test $? = 2 || fail_ "libpcre's match function appears to infloop" +test $? = 1 || fail_ "libpcre's match function appears to infloop" Exit $fail diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input index 913e8ee..9da4b18 100755 --- a/tests/pcre-invalid-utf8-input +++ b/tests/pcre-invalid-utf8-input @@ -13,9 +13,12 @@ require_en_utf8_locale_ fail=0 -printf 'j\202\nj\n' > in || framework_failure_ +printf 'j\202j\nj\nk\202\n' > in || framework_failure_ LC_ALL=en_US.UTF-8 grep -P j in -test $? -eq 2 || fail=1 +test $? -eq 0 || fail=1 + +LC_ALL=en_US.UTF-8 grep -P 'k$' in +test $? -eq 1 || fail=1 Exit $fail