vim/7.3.1011

To: vim_dev@googlegroups.com
Subject: Patch 7.3.1011
Fcc: outbox
From: Bram Moolenaar <Bram@moolenaar.net>
Mime-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
------------

Patch 7.3.1011
Problem:    New regexp engine is inefficient with multi-byte characters.
Solution:   Handle a character at a time instead of a byte at a time.  Also
            make \Z partly work.
Files:      src/regexp_nfa.c, src/testdir/test95.in, src/testdir/test95.ok


*** ../vim-7.3.1010/src/regexp_nfa.c	2013-05-24 20:25:28.000000000 +0200
--- src/regexp_nfa.c	2013-05-24 21:49:43.000000000 +0200
***************
*** 46,54 ****
      NFA_NCLOSE,			    /* End of subexpr. marked with \%( ... \) */
      NFA_START_INVISIBLE,
      NFA_END_INVISIBLE,
-     NFA_MULTIBYTE,		    /* Next nodes in NFA are part of the same
- 				       multibyte char */
-     NFA_END_MULTIBYTE,		    /* End of multibyte char in the NFA */
      NFA_COMPOSING,		    /* Next nodes in NFA are part of the
  				       composing multibyte char */
      NFA_END_COMPOSING,		    /* End of a composing char in the NFA */
--- 46,51 ----
***************
*** 195,220 ****
  		    *post_ptr++ = c;		\
  		} while (0)

- #define EMIT_MBYTE(c)					    \
- 			len = (*mb_char2bytes)(c, buf);	    \
- 			EMIT(buf[0]);			    \
- 			for (i = 1; i < len; i++)	    \
- 			{				    \
- 			    EMIT(buf[i]);		    \
- 			    EMIT(NFA_CONCAT);		    \
- 			}				    \
- 			EMIT(NFA_MULTIBYTE);
-
- #define EMIT_COMPOSING_UTF(input)			    \
- 			len = utfc_ptr2len(input);	    \
- 			EMIT(input[0]);			    \
- 			for (i = 1; i < len; i++)	    \
- 			{				    \
- 			    EMIT(input[i]);		    \
- 			    EMIT(NFA_CONCAT);		    \
- 			}				    \
- 			EMIT(NFA_COMPOSING);
-
  /*
   * Initialize internal variables before NFA compilation.
   * Return OK on success, FAIL otherwise.
--- 192,197 ----
***************
*** 611,618 ****
  #ifdef FEAT_MBYTE
      char_u	*old_regparse = regparse;
      int		clen;
-     int		len;
-     static char_u	buf[30];
      int		i;
  #endif
      int		extra = 0;
--- 588,593 ----
***************
*** 845,858 ****
  		    return FAIL;

  		    c = coll_get_char();
! #ifdef FEAT_MBYTE
! 		    if ((*mb_char2len)(c) > 1)
! 		    {
! 			EMIT_MBYTE(c);
! 		    }
! 		    else
! #endif
! 			EMIT(c);
  		    break;

  		/* Catch \%^ and \%$ regardless of where they appear in the
--- 820,826 ----
  		    return FAIL;

  		    c = coll_get_char();
! 		    EMIT(c);
  		    break;

  		/* Catch \%^ and \%$ regardless of where they appear in the
***************
*** 1135,1146 ****
  			     * skip it. */
  			    for (c = startc + 1; c <= endc; c++)
  			    {
! 				if ((*mb_char2len)(c) > 1)
! 				{
! 				    EMIT_MBYTE(c);
! 				}
! 				else
! 				    EMIT(c);
  				TRY_NEG();
  				EMIT_GLUE();
  			    }
--- 1103,1109 ----
  			     * skip it. */
  			    for (c = startc + 1; c <= endc; c++)
  			    {
! 				EMIT(c);
  				TRY_NEG();
  				EMIT_GLUE();
  			    }
***************
*** 1187,1200 ****
  			if (got_coll_char == TRUE && startc == 0)
  			    EMIT(0x0a);
  			else
! #ifdef FEAT_MBYTE
! 			    if ((*mb_char2len)(startc) > 1)
! 			    {
! 				EMIT_MBYTE(startc);
! 			    }
! 			    else
! #endif
! 				EMIT(startc);
  			TRY_NEG();
  			EMIT_GLUE();
  		    }
--- 1150,1156 ----
  			if (got_coll_char == TRUE && startc == 0)
  			    EMIT(0x0a);
  			else
! 			    EMIT(startc);
  			TRY_NEG();
  			EMIT_GLUE();
  		    }
***************
*** 1242,1271 ****
  		int	plen;

  nfa_do_multibyte:
! 		/* length of current char, with composing chars,
! 		 * from pointer */
! 		plen = (*mb_ptr2len)(old_regparse);
! 		if (enc_utf8 && clen != plen)
! 		{
! 		    /* A composing character is always handled as a
! 		     * separate atom, surrounded by NFA_COMPOSING and
! 		     * NFA_END_COMPOSING. Note that right now we are
  		     * building the postfix form, not the NFA itself;
  		     * a composing char could be: a, b, c, NFA_COMPOSING
! 		     * where 'a', 'b', 'c' are chars with codes > 256.
! 		     */
! 		    EMIT_COMPOSING_UTF(old_regparse);
  		    regparse = old_regparse + plen;
  		}
  		else
- 		    /* A multi-byte character is always handled as a
- 		     * separate atom, surrounded by NFA_MULTIBYTE and
- 		     * NFA_END_MULTIBYTE */
- 		    if (plen > 1)
- 		    {
- 			EMIT_MBYTE(c);
- 		    }
- 		    else
  #endif
  		{
  		    c = no_Magic(c);
--- 1198,1227 ----
  		int	plen;

  nfa_do_multibyte:
! 		/* Length of current char with composing chars. */
! 		if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse)))
! 		{
! 		    /* A base character plus composing characters.
! 		     * This requires creating a separate atom as if enclosing
! 		     * the characters in (), where NFA_COMPOSING is the ( and
! 		     * NFA_END_COMPOSING is the ). Note that right now we are
  		     * building the postfix form, not the NFA itself;
  		     * a composing char could be: a, b, c, NFA_COMPOSING
! 		     * where 'b' and 'c' are chars with codes > 256. */
! 		    i = 0;	    /* byte offset of "c" in the sequence */
! 		    for (;;)
! 		    {
! 			EMIT(c);
! 			if (i > 0)
! 			    EMIT(NFA_CONCAT);
! 			if ((i += utf_char2len(c)) >= plen)
! 			    break;	    /* all "plen" bytes were emitted */
! 			c = utf_ptr2char(old_regparse + i);
! 		    }
! 		    EMIT(NFA_COMPOSING);
  		    regparse = old_regparse + plen;
  		}
  		else
  #endif
  		{
  		    c = no_Magic(c);
***************
*** 1702,1710 ****
  	case NFA_START_INVISIBLE:   STRCPY(code, "NFA_START_INVISIBLE"); break;
  	case NFA_END_INVISIBLE:	    STRCPY(code, "NFA_END_INVISIBLE"); break;

- 	case NFA_MULTIBYTE:	    STRCPY(code, "NFA_MULTIBYTE"); break;
- 	case NFA_END_MULTIBYTE:	    STRCPY(code, "NFA_END_MULTIBYTE"); break;
-
  	case NFA_COMPOSING:	    STRCPY(code, "NFA_COMPOSING"); break;
  	case NFA_END_COMPOSING:	    STRCPY(code, "NFA_END_COMPOSING"); break;

--- 1658,1663 ----
***************
*** 2194,2200 ****
  	    }
  	    e1 = POP();
  	    e1.start->negated = TRUE;
! 	    if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING)
  		e1.start->out1->negated = TRUE;
  	    PUSH(e1);
  	    break;
--- 2147,2153 ----
  	    }
  	    e1 = POP();
  	    e1.start->negated = TRUE;
! 	    if (e1.start->c == NFA_COMPOSING)
  		e1.start->out1->negated = TRUE;
  	    PUSH(e1);
  	    break;
***************
*** 2311,2316 ****
--- 2264,2279 ----
  	    PUSH(frag(s, list1(&s1->out)));
  	    break;

+ 	case NFA_COMPOSING:	/* char with composing char */
+ #if 0
+ 	    /* TODO */
+ 	    if (regflags & RF_ICOMBINE)
+ 	    {
+ 		goto normalchar;
+ 	    }
+ #endif
+ 	    /* FALLTHROUGH */
+
  	case NFA_MOPEN + 0:	/* Submatch */
  	case NFA_MOPEN + 1:
  	case NFA_MOPEN + 2:
***************
*** 2322,2329 ****
  	case NFA_MOPEN + 8:
  	case NFA_MOPEN + 9:
  	case NFA_NOPEN:		/* \%( "Invisible Submatch" */
- 	case NFA_MULTIBYTE:	/* mbyte char */
- 	case NFA_COMPOSING:	/* composing char */
  	    if (nfa_calc_size == TRUE)
  	    {
  		nstate += 2;
--- 2285,2290 ----
***************
*** 2336,2344 ****
  		case NFA_NOPEN:
  		    mclose = NFA_NCLOSE;
  		    break;
- 		case NFA_MULTIBYTE:
- 		    mclose = NFA_END_MULTIBYTE;
- 		    break;
  		case NFA_COMPOSING:
  		    mclose = NFA_END_COMPOSING;
  		    break;
--- 2297,2302 ----
***************
*** 2377,2385 ****
  		goto theend;
  	    patch(e.out, s1);

! 	    if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING)
! 		/* MULTIBYTE->out1 = END_MULTIBYTE
! 		* COMPOSING->out1 = END_COMPOSING */
  		patch(list1(&s->out1), s1);

  	    PUSH(frag(s, list1(&s1->out)));
--- 2335,2342 ----
  		goto theend;
  	    patch(e.out, s1);

! 	    if (mopen == NFA_COMPOSING)
! 		/* COMPOSING->out1 = END_COMPOSING */
  		patch(list1(&s->out1), s1);

  	    PUSH(frag(s, list1(&s1->out)));
***************
*** 2540,2556 ****
  	case NFA_COMPOSING:
  	    /* nfa_regmatch() will match all the bytes of this composing char. */
  	    break;
-
- 	case NFA_MULTIBYTE:
- 	    /* nfa_regmatch() will match all the bytes of this multibyte char. */
- 	    break;
  #endif

- 	case NFA_END_MULTIBYTE:
- 	    /* Successfully matched this mbyte char */
- 	    addstate(l, state->out, m, off, lid, match);
- 	    break;
-
  	case NFA_NOPEN:
  	case NFA_NCLOSE:
  	    addstate(l, state->out, m, off, lid, match);
--- 2497,2504 ----
***************
*** 2841,2847 ****
      regsub_T		*submatch;
      regsub_T		*m;
  {
!     int		c = -1;
      int		n;
      int		i = 0;
      int		result;
--- 2789,2795 ----
      regsub_T		*submatch;
      regsub_T		*m;
  {
!     int		c;
      int		n;
      int		i = 0;
      int		result;
***************
*** 2859,2865 ****
      List	*listtbl[2][2];
      List	*ll;
      int		listid = 1;
-     int		endnode;
      List	*thislist;
      List	*nextlist;
      List	*neglist;
--- 2807,2812 ----
***************
*** 3190,3222 ****
  		break;
  	    }

! 	    case NFA_MULTIBYTE:
  	    case NFA_COMPOSING:
! 	        endnode = t->state->c + 1;
  		result = OK;
  		sta = t->state->out;
! 		len = 1;
! 		while (sta->c != endnode && len <= n)
  		{
! 		    if (reginput[len-1] != sta->c)
! 		    {
! 			result = FAIL;
  			break;
! 		    }
! 		    len++;
  		    sta = sta->out;
  		}

  		/* if input char length doesn't match regexp char length */
! 		if (len -1 < n || sta->c != endnode)
  		    result = FAIL;
! 		end = t->state->out1;	    /* NFA_END_MULTIBYTE or
! 					       NFA_END_COMPOSING */
  		/* If \Z was present, then ignore composing characters */
! 		if (ireg_icombine && endnode == NFA_END_COMPOSING)
  		    result = 1 ^ sta->negated;
  		ADD_POS_NEG_STATE(end);
  		break;

  	    case NFA_NEWL:
  		if (!reg_line_lbr && REG_MULTI
--- 3137,3171 ----
  		break;
  	    }

! #ifdef FEAT_MBYTE
  	    case NFA_COMPOSING:
! 	    {
! 		int mc = c;
!
  		result = OK;
  		sta = t->state->out;
! 		len = 0;
! 		while (sta->c != NFA_END_COMPOSING && len < n)
  		{
! 		    if (len > 0)
! 			mc = mb_ptr2char(reginput + len);
! 		    if (mc != sta->c)
  			break;
! 		    len += mb_char2len(mc);
  		    sta = sta->out;
  		}

  		/* if input char length doesn't match regexp char length */
! 		if (len < n || sta->c != NFA_END_COMPOSING)
  		    result = FAIL;
! 		end = t->state->out1;	    /* NFA_END_COMPOSING */
  		/* If \Z was present, then ignore composing characters */
! 		if (ireg_icombine)
  		    result = 1 ^ sta->negated;
  		ADD_POS_NEG_STATE(end);
  		break;
+ 	    }
+ #endif

  	    case NFA_NEWL:
  		if (!reg_line_lbr && REG_MULTI
***************
*** 3425,3430 ****
--- 3374,3387 ----
  		if (!result)
  		    result = ireg_ic == TRUE
  				&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
+ #ifdef FEAT_MBYTE
+ 		/* If there is a composing character which is not being
+ 		 * ignored there can be no match. Match with composing
+ 		 * character uses NFA_COMPOSING above. */
+ 		if (result && enc_utf8 && !ireg_icombine
+ 						      && n != utf_char2len(c))
+ 		    result = FALSE;
+ #endif
  		ADD_POS_NEG_STATE(t->state);
  		break;
  	    }
*** ../vim-7.3.1010/src/testdir/test95.in	2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.in	2013-05-24 20:45:08.000000000 +0200
***************
*** 35,40 ****
--- 35,44 ----
  :call add(tl, ['\f\+', '&*fname ', 'fname'])
  :call add(tl, ['\%#=1\f\+', '&*fname ', 'fname'])

+ :"""" Test composing character matching
+ :call add(tl, ['.ม', 'xม่x yมy', 'yม'])
+ :call add(tl, ['.ม่', 'xม่x yมy', 'xม่'])
+
  :"""" Test \Z
  :call add(tl, ['ú\Z', 'x'])

*** ../vim-7.3.1010/src/testdir/test95.ok	2013-05-24 20:25:28.000000000 +0200
--- src/testdir/test95.ok	2013-05-24 20:44:41.000000000 +0200
***************
*** 9,13 ****
--- 9,15 ----
  OK - \%#=1\i\+
  OK - \f\+
  OK - \%#=1\f\+
+ OK - .ม
+ OK - .ม่
  OK - ú\Z
  OK - [^[=a=]]\+
*** ../vim-7.3.1010/src/version.c	2013-05-24 20:25:28.000000000 +0200
--- src/version.c	2013-05-24 21:56:02.000000000 +0200
***************
*** 730,731 ****
--- 730,733 ----
  {   /* Add new patch number below this line */
+ /**/
+     1011,
  /**/

--
If you had to identify, in one word, the reason why the
human race has not achieved, and never will achieve, its
full potential, that word would be "meetings."

 /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net   \\\
///        sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\
\\\  an exciting new programming language -- http://www.Zimbu.org        ///
 \\\            help me help AIDS victims -- http://ICCF-Holland.org    ///