vim/7.3.1011
2013-06-04 12:05:56 +02:00

476 lines
12 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

To: vim_dev@googlegroups.com
Subject: Patch 7.3.1011
Fcc: outbox
From: Bram Moolenaar <Bram@moolenaar.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
------------
Patch 7.3.1011
Problem: New regexp engine is inefficient with multi-byte characters.
Solution: Handle a character at a time instead of a byte at a time. Also
make \Z partly work.
Files: src/regexp_nfa.c, src/testdir/test95.in, src/testdir/test95.ok
*** ../vim-7.3.1010/src/regexp_nfa.c 2013-05-24 20:25:28.000000000 +0200
--- src/regexp_nfa.c 2013-05-24 21:49:43.000000000 +0200
***************
*** 46,54 ****
NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
NFA_START_INVISIBLE,
NFA_END_INVISIBLE,
- NFA_MULTIBYTE, /* Next nodes in NFA are part of the same
- multibyte char */
- NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */
NFA_COMPOSING, /* Next nodes in NFA are part of the
composing multibyte char */
NFA_END_COMPOSING, /* End of a composing char in the NFA */
--- 46,51 ----
***************
*** 195,220 ****
*post_ptr++ = c; \
} while (0)
- #define EMIT_MBYTE(c) \
- len = (*mb_char2bytes)(c, buf); \
- EMIT(buf[0]); \
- for (i = 1; i < len; i++) \
- { \
- EMIT(buf[i]); \
- EMIT(NFA_CONCAT); \
- } \
- EMIT(NFA_MULTIBYTE);
-
- #define EMIT_COMPOSING_UTF(input) \
- len = utfc_ptr2len(input); \
- EMIT(input[0]); \
- for (i = 1; i < len; i++) \
- { \
- EMIT(input[i]); \
- EMIT(NFA_CONCAT); \
- } \
- EMIT(NFA_COMPOSING);
-
/*
* Initialize internal variables before NFA compilation.
* Return OK on success, FAIL otherwise.
--- 192,197 ----
***************
*** 611,618 ****
#ifdef FEAT_MBYTE
char_u *old_regparse = regparse;
int clen;
- int len;
- static char_u buf[30];
int i;
#endif
int extra = 0;
--- 588,593 ----
***************
*** 845,858 ****
return FAIL;
c = coll_get_char();
! #ifdef FEAT_MBYTE
! if ((*mb_char2len)(c) > 1)
! {
! EMIT_MBYTE(c);
! }
! else
! #endif
! EMIT(c);
break;
/* Catch \%^ and \%$ regardless of where they appear in the
--- 820,826 ----
return FAIL;
c = coll_get_char();
! EMIT(c);
break;
/* Catch \%^ and \%$ regardless of where they appear in the
***************
*** 1135,1146 ****
* skip it. */
for (c = startc + 1; c <= endc; c++)
{
! if ((*mb_char2len)(c) > 1)
! {
! EMIT_MBYTE(c);
! }
! else
! EMIT(c);
TRY_NEG();
EMIT_GLUE();
}
--- 1103,1109 ----
* skip it. */
for (c = startc + 1; c <= endc; c++)
{
! EMIT(c);
TRY_NEG();
EMIT_GLUE();
}
***************
*** 1187,1200 ****
if (got_coll_char == TRUE && startc == 0)
EMIT(0x0a);
else
! #ifdef FEAT_MBYTE
! if ((*mb_char2len)(startc) > 1)
! {
! EMIT_MBYTE(startc);
! }
! else
! #endif
! EMIT(startc);
TRY_NEG();
EMIT_GLUE();
}
--- 1150,1156 ----
if (got_coll_char == TRUE && startc == 0)
EMIT(0x0a);
else
! EMIT(startc);
TRY_NEG();
EMIT_GLUE();
}
***************
*** 1242,1271 ****
int plen;
nfa_do_multibyte:
! /* length of current char, with composing chars,
! * from pointer */
! plen = (*mb_ptr2len)(old_regparse);
! if (enc_utf8 && clen != plen)
! {
! /* A composing character is always handled as a
! * separate atom, surrounded by NFA_COMPOSING and
! * NFA_END_COMPOSING. Note that right now we are
* building the postfix form, not the NFA itself;
* a composing char could be: a, b, c, NFA_COMPOSING
! * where 'a', 'b', 'c' are chars with codes > 256.
! */
! EMIT_COMPOSING_UTF(old_regparse);
regparse = old_regparse + plen;
}
else
- /* A multi-byte character is always handled as a
- * separate atom, surrounded by NFA_MULTIBYTE and
- * NFA_END_MULTIBYTE */
- if (plen > 1)
- {
- EMIT_MBYTE(c);
- }
- else
#endif
{
c = no_Magic(c);
--- 1198,1227 ----
int plen;
nfa_do_multibyte:
! /* Length of current char with composing chars. */
! if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse)))
! {
! /* A base character plus composing characters.
! * This requires creating a separate atom as if enclosing
! * the characters in (), where NFA_COMPOSING is the ( and
! * NFA_END_COMPOSING is the ). Note that right now we are
* building the postfix form, not the NFA itself;
* a composing char could be: a, b, c, NFA_COMPOSING
! * where 'b' and 'c' are chars with codes > 256. */
! i = 0;
! for (;;)
! {
! EMIT(c);
! if (i > 0)
! EMIT(NFA_CONCAT);
! if (i += utf_char2len(c) >= plen)
! break;
! c = utf_ptr2char(old_regparse + i);
! }
! EMIT(NFA_COMPOSING);
regparse = old_regparse + plen;
}
else
#endif
{
c = no_Magic(c);
***************
*** 1702,1710 ****
case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
- case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break;
- case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break;
-
case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
--- 1658,1663 ----
***************
*** 2194,2200 ****
}
e1 = POP();
e1.start->negated = TRUE;
! if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING)
e1.start->out1->negated = TRUE;
PUSH(e1);
break;
--- 2147,2153 ----
}
e1 = POP();
e1.start->negated = TRUE;
! if (e1.start->c == NFA_COMPOSING)
e1.start->out1->negated = TRUE;
PUSH(e1);
break;
***************
*** 2311,2316 ****
--- 2264,2279 ----
PUSH(frag(s, list1(&s1->out)));
break;
+ case NFA_COMPOSING: /* char with composing char */
+ #if 0
+ /* TODO */
+ if (regflags & RF_ICOMBINE)
+ {
+ goto normalchar;
+ }
+ #endif
+ /* FALLTHROUGH */
+
case NFA_MOPEN + 0: /* Submatch */
case NFA_MOPEN + 1:
case NFA_MOPEN + 2:
***************
*** 2322,2329 ****
case NFA_MOPEN + 8:
case NFA_MOPEN + 9:
case NFA_NOPEN: /* \%( "Invisible Submatch" */
- case NFA_MULTIBYTE: /* mbyte char */
- case NFA_COMPOSING: /* composing char */
if (nfa_calc_size == TRUE)
{
nstate += 2;
--- 2285,2290 ----
***************
*** 2336,2344 ****
case NFA_NOPEN:
mclose = NFA_NCLOSE;
break;
- case NFA_MULTIBYTE:
- mclose = NFA_END_MULTIBYTE;
- break;
case NFA_COMPOSING:
mclose = NFA_END_COMPOSING;
break;
--- 2297,2302 ----
***************
*** 2377,2385 ****
goto theend;
patch(e.out, s1);
! if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING)
! /* MULTIBYTE->out1 = END_MULTIBYTE
! * COMPOSING->out1 = END_COMPOSING */
patch(list1(&s->out1), s1);
PUSH(frag(s, list1(&s1->out)));
--- 2335,2342 ----
goto theend;
patch(e.out, s1);
! if (mopen == NFA_COMPOSING)
! /* COMPOSING->out1 = END_COMPOSING */
patch(list1(&s->out1), s1);
PUSH(frag(s, list1(&s1->out)));
***************
*** 2540,2556 ****
case NFA_COMPOSING:
/* nfa_regmatch() will match all the bytes of this composing char. */
break;
-
- case NFA_MULTIBYTE:
- /* nfa_regmatch() will match all the bytes of this multibyte char. */
- break;
#endif
- case NFA_END_MULTIBYTE:
- /* Successfully matched this mbyte char */
- addstate(l, state->out, m, off, lid, match);
- break;
-
case NFA_NOPEN:
case NFA_NCLOSE:
addstate(l, state->out, m, off, lid, match);
--- 2497,2504 ----
***************
*** 2841,2847 ****
regsub_T *submatch;
regsub_T *m;
{
! int c = -1;
int n;
int i = 0;
int result;
--- 2789,2795 ----
regsub_T *submatch;
regsub_T *m;
{
! int c;
int n;
int i = 0;
int result;
***************
*** 2859,2865 ****
List *listtbl[2][2];
List *ll;
int listid = 1;
- int endnode;
List *thislist;
List *nextlist;
List *neglist;
--- 2807,2812 ----
***************
*** 3190,3222 ****
break;
}
! case NFA_MULTIBYTE:
case NFA_COMPOSING:
! endnode = t->state->c + 1;
result = OK;
sta = t->state->out;
! len = 1;
! while (sta->c != endnode && len <= n)
{
! if (reginput[len-1] != sta->c)
! {
! result = FAIL;
break;
! }
! len++;
sta = sta->out;
}
/* if input char length doesn't match regexp char length */
! if (len -1 < n || sta->c != endnode)
result = FAIL;
! end = t->state->out1; /* NFA_END_MULTIBYTE or
! NFA_END_COMPOSING */
/* If \Z was present, then ignore composing characters */
! if (ireg_icombine && endnode == NFA_END_COMPOSING)
result = 1 ^ sta->negated;
ADD_POS_NEG_STATE(end);
break;
case NFA_NEWL:
if (!reg_line_lbr && REG_MULTI
--- 3137,3171 ----
break;
}
! #ifdef FEAT_MBYTE
case NFA_COMPOSING:
! {
! int mc = c;
!
result = OK;
sta = t->state->out;
! len = 0;
! while (sta->c != NFA_END_COMPOSING && len < n)
{
! if (len > 0)
! mc = mb_ptr2char(reginput + len);
! if (mc != sta->c)
break;
! len += mb_char2len(mc);
sta = sta->out;
}
/* if input char length doesn't match regexp char length */
! if (len < n || sta->c != NFA_END_COMPOSING)
result = FAIL;
! end = t->state->out1; /* NFA_END_COMPOSING */
/* If \Z was present, then ignore composing characters */
! if (ireg_icombine)
result = 1 ^ sta->negated;
ADD_POS_NEG_STATE(end);
break;
+ }
+ #endif
case NFA_NEWL:
if (!reg_line_lbr && REG_MULTI
***************
*** 3425,3430 ****
--- 3374,3387 ----
if (!result)
result = ireg_ic == TRUE
&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
+ #ifdef FEAT_MBYTE
+ /* If there is a composing character which is not being
+ * ignored there can be no match. Match with composing
+ * character uses NFA_COMPOSING above. */
+ if (result && enc_utf8 && !ireg_icombine
+ && n != utf_char2len(c))
+ result = FALSE;
+ #endif
ADD_POS_NEG_STATE(t->state);
break;
}
*** ../vim-7.3.1010/src/testdir/test95.in 2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.in 2013-05-24 20:45:08.000000000 +0200
***************
*** 35,40 ****
--- 35,44 ----
:call add(tl, ['\f\+', '&*Ÿfname ', 'fname'])
:call add(tl, ['\%#=1\f\+', '&*Ÿfname ', 'fname'])
+ :"""" Test composing character matching
+ :call add(tl, ['.ม', 'xม่x yมy', 'yม'])
+ :call add(tl, ['.ม่', 'xม่x yมy', 'xม่'])
+
:"""" Test \Z
:call add(tl, ['ú\Z', 'x'])
*** ../vim-7.3.1010/src/testdir/test95.ok 2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.ok 2013-05-24 20:44:41.000000000 +0200
***************
*** 9,13 ****
--- 9,15 ----
OK - \%#=1\i\+
OK - \f\+
OK - \%#=1\f\+
+ OK - .ม
+ OK - .ม่
OK - ú\Z
OK - [^[=a=]]\+
*** ../vim-7.3.1010/src/version.c 2013-05-24 20:25:28.000000000 +0200
--- src/version.c 2013-05-24 21:56:02.000000000 +0200
***************
*** 730,731 ****
--- 730,733 ----
{ /* Add new patch number below this line */
+ /**/
+ 1011,
/**/
--
If you had to identify, in one word, the reason why the
human race has not achieved, and never will achieve, its
full potential, that word would be "meetings."
/// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\
/// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\ an exciting new programming language -- http://www.Zimbu.org ///
\\\ help me help AIDS victims -- http://ICCF-Holland.org ///