Fix caseless reference

2011-05-09 13:58:00 +02:00 · 2011-05-09 13:58:00 +02:00 · 54a4973709
commit 54a4973709
parent abbcae7348
2 changed files with 560 additions and 1 deletions
--- a/pcre-8.12-caseless_reference.patch
+++ b/pcre-8.12-caseless_reference.patch
@ -0,0 +1,552 @@
 r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines
 Fix problems with caseless reference matching in UTF-8 mode when the 
 upper/lower case characters have different lengths.
 and 
 r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines
 Complete the previously incomplete fix for UTF-8 caseless references of different lengths.
 http://bugs.exim.org/show_bug.cgi?id=1074
 Petr Pisar: Changelog and comment changes removed.
 Index: testdata/testoutput12
 ===================================================================
 --- testdata/testoutput12	(revision 594)
 +++ testdata/testoutput12	(revision 595)
@@ -1176,4 +1176,64 @@
         End
 ------------------------------------------------------------------
 +/-- These behaved oddly in Perl, so they are kept in this test --/
 +
 +/(\x{23a}\x{23a}\x{23a})?\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
 +No match
 +
 +/(ȺȺȺ)?\1/8i
 +    ȺȺȺⱥⱥ
 +No match
 +
 +/(\x{23a}\x{23a}\x{23a})?\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 1: \x{23a}\x{23a}\x{23a}
 +
 +/(ȺȺȺ)?\1/8i
 +    ȺȺȺⱥⱥⱥ
 + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 1: \x{23a}\x{23a}\x{23a}
 +
 +/(\x{23a}\x{23a}\x{23a})\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
 +No match
 +
 +/(ȺȺȺ)\1/8i
 +    ȺȺȺⱥⱥ
 +No match
 +
 +/(\x{23a}\x{23a}\x{23a})\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 1: \x{23a}\x{23a}\x{23a}
 +
 +/(ȺȺȺ)\1/8i
 +    ȺȺȺⱥⱥⱥ
 + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 + 1: \x{23a}\x{23a}\x{23a}
 +
 +/(\x{2c65}\x{2c65})\1/8i
 +    \x{2c65}\x{2c65}\x{23a}\x{23a}
 + 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
 + 1: \x{2c65}\x{2c65}
 +    
 +/(ⱥⱥ)\1/8i
 +    ⱥⱥȺȺ 
 + 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
 + 1: \x{2c65}\x{2c65}
 +    
 +/(\x{23a}\x{23a}\x{23a})\1Y/8i
 +    X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
 + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y
 + 1: \x{23a}\x{23a}\x{23a}
 +
 +/(\x{2c65}\x{2c65})\1Y/8i
 +    X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
 + 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
 + 1: \x{2c65}\x{2c65}
 +
 +/-- --/ 
 +
 /-- End of testinput12 --/
 Index: testdata/testinput12
 ===================================================================
 --- testdata/testinput12	(revision 594)
 +++ testdata/testinput12	(revision 595)
@@ -503,4 +503,44 @@
 /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
 +/-- These behaved oddly in Perl, so they are kept in this test --/
 +
 +/(\x{23a}\x{23a}\x{23a})?\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
 +
 +/(ȺȺȺ)?\1/8i
 +    ȺȺȺⱥⱥ
 +
 +/(\x{23a}\x{23a}\x{23a})?\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 +
 +/(ȺȺȺ)?\1/8i
 +    ȺȺȺⱥⱥⱥ
 +
 +/(\x{23a}\x{23a}\x{23a})\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
 +
 +/(ȺȺȺ)\1/8i
 +    ȺȺȺⱥⱥ
 +
 +/(\x{23a}\x{23a}\x{23a})\1/8i
 +    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
 +
 +/(ȺȺȺ)\1/8i
 +    ȺȺȺⱥⱥⱥ
 +
 +/(\x{2c65}\x{2c65})\1/8i
 +    \x{2c65}\x{2c65}\x{23a}\x{23a}
 +    
 +/(ⱥⱥ)\1/8i
 +    ⱥⱥȺȺ 
 +    
 +/(\x{23a}\x{23a}\x{23a})\1Y/8i
 +    X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
 +
 +/(\x{2c65}\x{2c65})\1Y/8i
 +    X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
 +
 +/-- --/ 
 +
 /-- End of testinput12 --/
 Index: pcre_exec.c
 ===================================================================
 --- pcre_exec.c	(revision 594)
 +++ pcre_exec.c	(revision 595)
@@ -132,24 +132,27 @@
 *          Match a back-reference                *
 *************************************************/
 -/* If a back reference hasn't been set, the length that is passed is greater
 -than the number of characters left in the string, so the match fails.
 +/* Normally, if a back reference hasn't been set, the length that is passed is
 +negative, so the match always fails. However, in JavaScript compatibility mode,
 +the length passed is zero. Note that in caseless UTF-8 mode, the number of 
 +subject bytes matched may be different to the number of reference bytes.
 Arguments:
   offset      index into the offset vector
 -  eptr        points into the subject
 -  length      length to be matched
 +  eptr        pointer into the subject
 +  length      length of reference to be matched (number of bytes)
   md          points to match data block
   ims         the ims flags
 -Returns:      TRUE if matched
 +Returns:      < 0 if not matched, otherwise the number of subject bytes matched
 */
 -static BOOL
 +static int
 match_ref(int offset, register USPTR eptr, int length, match_data *md,
   unsigned long int ims)
 {
 -USPTR p = md->start_subject + md->offset_vector[offset];
 +USPTR eptr_start = eptr;
 +register USPTR p = md->start_subject + md->offset_vector[offset];
 #ifdef PCRE_DEBUG
 if (eptr >= md->end_subject)
@@ -164,9 +167,9 @@
 printf("\n");
 #endif
 -/* Always fail if not enough characters left */
 +/* Always fail if reference not set (and not JavaScript compatible). */
 -if (length > md->end_subject - eptr) return FALSE;
 +if (length < 0) return -1;
 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
 properly if Unicode properties are supported. Otherwise, we can check only
@@ -178,13 +181,21 @@
 #ifdef SUPPORT_UCP
   if (md->utf8)
     {
 -    USPTR endptr = eptr + length;
 -    while (eptr < endptr)
 +    /* Match characters up to the end of the reference. NOTE: the number of 
 +    bytes matched may differ, because there are some characters whose upper and
 +    lower case versions code as different numbers of bytes. For example, U+023A
 +    (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
 +    a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
 +    the latter. It is important, therefore, to check the length along the 
 +    reference, not along the subject (earlier code did this wrong). */
 + 
 +    USPTR endptr = p + length;
 +    while (p < endptr)
       {
       int c, d;
       GETCHARINC(c, eptr);
       GETCHARINC(d, p);
 -      if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
 +      if (c != d && c != UCD_OTHERCASE(d)) return -1;
       }
     }
   else
@@ -195,16 +206,16 @@
   is no UCP support. */
   while (length-- > 0)
 -    { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
 +    { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
   }
 /* In the caseful case, we can just compare the bytes, whether or not we
 are in UTF-8 mode. */
 else
 -  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
 +  { while (length-- > 0) if (*p++ != *eptr++) return -1; }
 -return TRUE;
 +return eptr - eptr_start;
 }
@@ -2252,129 +2263,129 @@
     loops). */
     case OP_REF:
 -      {
 -      offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
 -      ecode += 3;
 +    offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
 +    ecode += 3;
 -      /* If the reference is unset, there are two possibilities:
 +    /* If the reference is unset, there are two possibilities:
 -      (a) In the default, Perl-compatible state, set the length to be longer
 -      than the amount of subject left; this ensures that every attempt at a
 -      match fails. We can't just fail here, because of the possibility of
 -      quantifiers with zero minima.
 +    (a) In the default, Perl-compatible state, set the length negative;
 +    this ensures that every attempt at a match fails. We can't just fail
 +    here, because of the possibility of quantifiers with zero minima.
 -      (b) If the JavaScript compatibility flag is set, set the length to zero
 -      so that the back reference matches an empty string.
 +    (b) If the JavaScript compatibility flag is set, set the length to zero
 +    so that the back reference matches an empty string.
 -      Otherwise, set the length to the length of what was matched by the
 -      referenced subpattern. */
 +    Otherwise, set the length to the length of what was matched by the
 +    referenced subpattern. */
 -      if (offset >= offset_top || md->offset_vector[offset] < 0)
 -        length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
 -      else
 -        length = md->offset_vector[offset+1] - md->offset_vector[offset];
 +    if (offset >= offset_top || md->offset_vector[offset] < 0)
 +      length = (md->jscript_compat)? 0 : -1;
 +    else
 +      length = md->offset_vector[offset+1] - md->offset_vector[offset];
 -      /* Set up for repetition, or handle the non-repeated case */
 +    /* Set up for repetition, or handle the non-repeated case */
 -      switch (*ecode)
 -        {
 -        case OP_CRSTAR:
 -        case OP_CRMINSTAR:
 -        case OP_CRPLUS:
 -        case OP_CRMINPLUS:
 -        case OP_CRQUERY:
 -        case OP_CRMINQUERY:
 -        c = *ecode++ - OP_CRSTAR;
 -        minimize = (c & 1) != 0;
 -        min = rep_min[c];                 /* Pick up values from tables; */
 -        max = rep_max[c];                 /* zero for max => infinity */
 -        if (max == 0) max = INT_MAX;
 -        break;
 +    switch (*ecode)
 +      {
 +      case OP_CRSTAR:
 +      case OP_CRMINSTAR:
 +      case OP_CRPLUS:
 +      case OP_CRMINPLUS:
 +      case OP_CRQUERY:
 +      case OP_CRMINQUERY:
 +      c = *ecode++ - OP_CRSTAR;
 +      minimize = (c & 1) != 0;
 +      min = rep_min[c];                 /* Pick up values from tables; */
 +      max = rep_max[c];                 /* zero for max => infinity */
 +      if (max == 0) max = INT_MAX;
 +      break;
 -        case OP_CRRANGE:
 -        case OP_CRMINRANGE:
 -        minimize = (*ecode == OP_CRMINRANGE);
 -        min = GET2(ecode, 1);
 -        max = GET2(ecode, 3);
 -        if (max == 0) max = INT_MAX;
 -        ecode += 5;
 -        break;
 +      case OP_CRRANGE:
 +      case OP_CRMINRANGE:
 +      minimize = (*ecode == OP_CRMINRANGE);
 +      min = GET2(ecode, 1);
 +      max = GET2(ecode, 3);
 +      if (max == 0) max = INT_MAX;
 +      ecode += 5;
 +      break;
 -        default:               /* No repeat follows */
 -        if (!match_ref(offset, eptr, length, md, ims))
 -          {
 -          CHECK_PARTIAL();
 -          MRRETURN(MATCH_NOMATCH);
 -          }
 -        eptr += length;
 -        continue;              /* With the main loop */
 +      default:               /* No repeat follows */
 +      if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
 +        {
 +        CHECK_PARTIAL();
 +        MRRETURN(MATCH_NOMATCH);
         }
 +      eptr += length;
 +      continue;              /* With the main loop */
 +      }
 -      /* If the length of the reference is zero, just continue with the
 -      main loop. */
 +    /* Handle repeated back references. If the length of the reference is
 +    zero, just continue with the main loop. */
 -      if (length == 0) continue;
 +    if (length == 0) continue;
 -      /* First, ensure the minimum number of matches are present. We get back
 -      the length of the reference string explicitly rather than passing the
 -      address of eptr, so that eptr can be a register variable. */
 +    /* First, ensure the minimum number of matches are present. We get back
 +    the length of the reference string explicitly rather than passing the
 +    address of eptr, so that eptr can be a register variable. */
 -      for (i = 1; i <= min; i++)
 +    for (i = 1; i <= min; i++)
 +      {
 +      int slength; 
 +      if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
         {
 -        if (!match_ref(offset, eptr, length, md, ims))
 -          {
 -          CHECK_PARTIAL();
 -          MRRETURN(MATCH_NOMATCH);
 -          }
 -        eptr += length;
 +        CHECK_PARTIAL();
 +        MRRETURN(MATCH_NOMATCH);
         }
 +      eptr += slength;
 +      }
 -      /* If min = max, continue at the same level without recursion.
 -      They are not both allowed to be zero. */
 +    /* If min = max, continue at the same level without recursion.
 +    They are not both allowed to be zero. */
 -      if (min == max) continue;
 +    if (min == max) continue;
 -      /* If minimizing, keep trying and advancing the pointer */
 +    /* If minimizing, keep trying and advancing the pointer */
 -      if (minimize)
 +    if (minimize)
 +      {
 +      for (fi = min;; fi++)
         {
 -        for (fi = min;; fi++)
 +        int slength; 
 +        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
 +        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 +        if (fi >= max) MRRETURN(MATCH_NOMATCH);
 +        if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
           {
 -          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
 -          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 -          if (fi >= max) MRRETURN(MATCH_NOMATCH);
 -          if (!match_ref(offset, eptr, length, md, ims))
 -            {
 -            CHECK_PARTIAL();
 -            MRRETURN(MATCH_NOMATCH);
 -            }
 -          eptr += length;
 +          CHECK_PARTIAL();
 +          MRRETURN(MATCH_NOMATCH);
           }
 -        /* Control never gets here */
 +        eptr += slength;
         }
 +      /* Control never gets here */
 +      }
 -      /* If maximizing, find the longest string and work backwards */
 +    /* If maximizing, find the longest string and work backwards */
 -      else
 +    else
 +      {
 +      pp = eptr;
 +      for (i = min; i < max; i++)
         {
 -        pp = eptr;
 -        for (i = min; i < max; i++)
 +        int slength; 
 +        if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
           {
 -          if (!match_ref(offset, eptr, length, md, ims))
 -            {
 -            CHECK_PARTIAL();
 -            break;
 -            }
 -          eptr += length;
 +          CHECK_PARTIAL();
 +          break;
           }
 -        while (eptr >= pp)
 -          {
 -          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
 -          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 -          eptr -= length;
 -          }
 -        MRRETURN(MATCH_NOMATCH);
 +        eptr += slength;
         }
 +      while (eptr >= pp)
 +        {
 +        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
 +        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 +        eptr -= length;
 +        }
 +      MRRETURN(MATCH_NOMATCH);
       }
     /* Control never gets here */
 Index: testdata/testinput1
 ===================================================================
 --- testdata/testinput1	(revision 596)
 +++ testdata/testinput1	(revision 597)
@@ -4079,4 +4079,10 @@
 /^\c/
     ?
 +/(abc)\1/i
 +   abc
 +
 +/(abc)\1/
 +   abc
 +
 /-- End of testinput1 --/
 Index: testdata/testoutput1
 ===================================================================
 --- testdata/testoutput1	(revision 596)
 +++ testdata/testoutput1	(revision 597)
@@ -6666,4 +6666,12 @@
     ?
  0: ?
 +/(abc)\1/i
 +   abc
 +No match
 +
 +/(abc)\1/
 +   abc
 +No match
 +
 /-- End of testinput1 --/
 Index: testdata/testinput4
 ===================================================================
 --- testdata/testinput4	(revision 596)
 +++ testdata/testinput4	(revision 597)
@@ -644,4 +644,10 @@
 /A*/g8
     AAB\x{123}BAA
 +/(abc)\1/8i
 +   abc
 +
 +/(abc)\1/8
 +   abc
 +
 /-- End of testinput4 --/
 Index: testdata/testoutput4
 ===================================================================
 --- testdata/testoutput4	(revision 596)
 +++ testdata/testoutput4	(revision 597)
@@ -1128,4 +1128,12 @@
  0: AA
  0: 
 +/(abc)\1/8i
 +   abc
 +No match
 +
 +/(abc)\1/8
 +   abc
 +No match
 +
 /-- End of testinput4 --/
 Index: pcre_exec.c
 ===================================================================
 --- pcre_exec.c	(revision 596)
 +++ pcre_exec.c	(revision 597)
@@ -193,6 +193,7 @@
     while (p < endptr)
       {
       int c, d;
 +      if (eptr >= md->end_subject) return -1;
       GETCHARINC(c, eptr);
       GETCHARINC(d, p);
       if (c != d && c != UCD_OTHERCASE(d)) return -1;
@@ -204,16 +205,21 @@
   /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
   is no UCP support. */
 -
 -  while (length-- > 0)
 -    { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
 +    {
 +    if (eptr + length > md->end_subject) return -1; 
 +    while (length-- > 0)
 +      { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
 +    }   
   }
 /* In the caseful case, we can just compare the bytes, whether or not we
 are in UTF-8 mode. */
 else
 -  { while (length-- > 0) if (*p++ != *eptr++) return -1; }
 +  { 
 +  if (eptr + length > md->end_subject) return -1; 
 +  while (length-- > 0) if (*p++ != *eptr++) return -1; 
 +  }
 return eptr - eptr_start;
 }
--- a/pcre.spec
+++ b/pcre.spec
@ -1,6 +1,6 @@
 Name: pcre
 Version: 8.12
-Release: 3%{?dist}
+Release: 4%{?dist}
 Summary: Perl-compatible regular expression library
 Group: System Environment/Libraries
 License: BSD
@ -11,6 +11,8 @@ Patch0: pcre-8.10-multilib.patch
 Patch1: pcre-8.12-manual_typos.patch
 # Refused by upstream, bug #675477
 Patch2: pcre-8.12-refused_spelling_terminated.patch
 # In upstream, bug #702623
 Patch3: pcre-8.12-caseless_reference.patch
 # New libtool to get rid of rpath
 BuildRequires: autoconf, automake, libtool
@ -44,6 +46,7 @@ Library for static linking for %{name}.
 libtoolize --copy --force && autoreconf
 %patch1 -p0 -b .manual_typos
 %patch2 -p1 -b .terminated_typos
 %patch3 -p0 -b .caseless_reference
 # One contributor's name is non-UTF-8
 for F in ChangeLog; do
    iconv -f latin1 -t utf8 "$F" >"${F}.utf8"
@ -103,6 +106,10 @@ make check
 %doc COPYING LICENCE 
 %changelog
 * Mon May 09 2011 Petr Pisar <ppisar@redhat.com> - 8.12-4
 - Fix caseless reference matching in UTF-8 mode when the upper/lower case
  characters have different lengths (bug #702623)
 * Mon May 09 2011 Petr Pisar <ppisar@redhat.com> - 8.12-3
 - Fix typos in manual pages (bugs #675476, #675477)
 - Clean spec file up