Fix caseless reference

2011-05-09 13:58:00 +02:00 · 2011-05-09 13:58:00 +02:00 · 54a4973709
commit 54a4973709
parent abbcae7348
2 changed files with 560 additions and 1 deletions
--- a/pcre-8.12-caseless_reference.patch
+++ b/pcre-8.12-caseless_reference.patch
@ -0,0 +1,552 @@
+r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines
+Fix problems with caseless reference matching in UTF-8 mode when the 
+upper/lower case characters have different lengths.
+
+and 
+
+r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines
+Complete incomplete fix for UTF-8 caseless references of different lengths.
+
+http://bugs.exim.org/show_bug.cgi?id=1074
+
+Petr Pisar: Changelog and comment changes removed.
+
+Index: testdata/testoutput12
+===================================================================
+--- testdata/testoutput12	(revision 594)
+++ testdata/testoutput12	(revision 595)
+@@ -1176,4 +1176,64 @@
+         End
+ ------------------------------------------------------------------
+ 
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)?\1/8i
+    ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)?\1/8i
+    ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)\1/8i
+    ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)\1/8i
+    ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1/8i
+    \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+    
+/(ⱥⱥ)\1/8i
+    ⱥⱥȺȺ 
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+    
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+    X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1Y/8i
+    X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
+ 1: \x{2c65}\x{2c65}
+
+/-- --/ 
+
+ /-- End of testinput12 --/
+Index: testdata/testinput12
+===================================================================
+--- testdata/testinput12	(revision 594)
+++ testdata/testinput12	(revision 595)
+@@ -503,4 +503,44 @@
+ 
+ /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+ 
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+    ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+    ȺȺȺⱥⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+    ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+    \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+    ȺȺȺⱥⱥⱥ
+
+/(\x{2c65}\x{2c65})\1/8i
+    \x{2c65}\x{2c65}\x{23a}\x{23a}
+    
+/(ⱥⱥ)\1/8i
+    ⱥⱥȺȺ 
+    
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+    X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+
+/(\x{2c65}\x{2c65})\1Y/8i
+    X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+
+/-- --/ 
+
+ /-- End of testinput12 --/
+Index: pcre_exec.c
+===================================================================
+--- pcre_exec.c	(revision 594)
+++ pcre_exec.c	(revision 595)
+@@ -132,24 +132,27 @@
+ *          Match a back-reference                *
+ *************************************************/
+ 
+-/* If a back reference hasn't been set, the length that is passed is greater
+-than the number of characters left in the string, so the match fails.
+/* Normally, if a back reference hasn't been set, the length that is passed is
+negative, so the match always fails. However, in JavaScript compatibility mode,
+the length passed is zero. Note that in caseless UTF-8 mode, the number of 
+subject bytes matched may be different to the number of reference bytes.
+ 
+ Arguments:
+   offset      index into the offset vector
+-  eptr        points into the subject
+-  length      length to be matched
+  eptr        pointer into the subject
+  length      length of reference to be matched (number of bytes)
+   md          points to match data block
+   ims         the ims flags
+ 
+-Returns:      TRUE if matched
+Returns:      < 0 if not matched, otherwise the number of subject bytes matched
+ */
+ 
+-static BOOL
+static int
+ match_ref(int offset, register USPTR eptr, int length, match_data *md,
+   unsigned long int ims)
+ {
+-USPTR p = md->start_subject + md->offset_vector[offset];
+USPTR eptr_start = eptr;
+register USPTR p = md->start_subject + md->offset_vector[offset];
+ 
+ #ifdef PCRE_DEBUG
+ if (eptr >= md->end_subject)
+@@ -164,9 +167,9 @@
+ printf("\n");
+ #endif
+ 
+-/* Always fail if not enough characters left */
+/* Always fail if reference not set (and not JavaScript compatible). */
+ 
+-if (length > md->end_subject - eptr) return FALSE;
+if (length < 0) return -1;
+ 
+ /* Separate the caseless case for speed. In UTF-8 mode we can only do this
+ properly if Unicode properties are supported. Otherwise, we can check only
+@@ -178,13 +181,21 @@
+ #ifdef SUPPORT_UCP
+   if (md->utf8)
+     {
+-    USPTR endptr = eptr + length;
+-    while (eptr < endptr)
+    /* Match characters up to the end of the reference. NOTE: the number of 
+    bytes matched may differ, because there are some characters whose upper and
+    lower case versions code as different numbers of bytes. For example, U+023A
+    (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
+    a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
+    the latter. It is important, therefore, to check the length along the 
+    reference, not along the subject (earlier code did this wrong). */
+ 
+    USPTR endptr = p + length;
+    while (p < endptr)
+       {
+       int c, d;
+       GETCHARINC(c, eptr);
+       GETCHARINC(d, p);
+-      if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+      if (c != d && c != UCD_OTHERCASE(d)) return -1;
+       }
+     }
+   else
+@@ -195,16 +206,16 @@
+   is no UCP support. */
+ 
+   while (length-- > 0)
+-    { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
+    { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+   }
+ 
+ /* In the caseful case, we can just compare the bytes, whether or not we
+ are in UTF-8 mode. */
+ 
+ else
+-  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
+  { while (length-- > 0) if (*p++ != *eptr++) return -1; }
+ 
+-return TRUE;
+return eptr - eptr_start;
+ }
+ 
+ 
+@@ -2252,129 +2263,129 @@
+     loops). */
+ 
+     case OP_REF:
+-      {
+-      offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
+-      ecode += 3;
+    offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
+    ecode += 3;
+ 
+-      /* If the reference is unset, there are two possibilities:
+    /* If the reference is unset, there are two possibilities:
+ 
+-      (a) In the default, Perl-compatible state, set the length to be longer
+-      than the amount of subject left; this ensures that every attempt at a
+-      match fails. We can't just fail here, because of the possibility of
+-      quantifiers with zero minima.
+    (a) In the default, Perl-compatible state, set the length negative;
+    this ensures that every attempt at a match fails. We can't just fail
+    here, because of the possibility of quantifiers with zero minima.
+ 
+-      (b) If the JavaScript compatibility flag is set, set the length to zero
+-      so that the back reference matches an empty string.
+    (b) If the JavaScript compatibility flag is set, set the length to zero
+    so that the back reference matches an empty string.
+ 
+-      Otherwise, set the length to the length of what was matched by the
+-      referenced subpattern. */
+    Otherwise, set the length to the length of what was matched by the
+    referenced subpattern. */
+ 
+-      if (offset >= offset_top || md->offset_vector[offset] < 0)
+-        length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
+-      else
+-        length = md->offset_vector[offset+1] - md->offset_vector[offset];
+    if (offset >= offset_top || md->offset_vector[offset] < 0)
+      length = (md->jscript_compat)? 0 : -1;
+    else
+      length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ 
+-      /* Set up for repetition, or handle the non-repeated case */
+    /* Set up for repetition, or handle the non-repeated case */
+ 
+-      switch (*ecode)
+-        {
+-        case OP_CRSTAR:
+-        case OP_CRMINSTAR:
+-        case OP_CRPLUS:
+-        case OP_CRMINPLUS:
+-        case OP_CRQUERY:
+-        case OP_CRMINQUERY:
+-        c = *ecode++ - OP_CRSTAR;
+-        minimize = (c & 1) != 0;
+-        min = rep_min[c];                 /* Pick up values from tables; */
+-        max = rep_max[c];                 /* zero for max => infinity */
+-        if (max == 0) max = INT_MAX;
+-        break;
+    switch (*ecode)
+      {
+      case OP_CRSTAR:
+      case OP_CRMINSTAR:
+      case OP_CRPLUS:
+      case OP_CRMINPLUS:
+      case OP_CRQUERY:
+      case OP_CRMINQUERY:
+      c = *ecode++ - OP_CRSTAR;
+      minimize = (c & 1) != 0;
+      min = rep_min[c];                 /* Pick up values from tables; */
+      max = rep_max[c];                 /* zero for max => infinity */
+      if (max == 0) max = INT_MAX;
+      break;
+ 
+-        case OP_CRRANGE:
+-        case OP_CRMINRANGE:
+-        minimize = (*ecode == OP_CRMINRANGE);
+-        min = GET2(ecode, 1);
+-        max = GET2(ecode, 3);
+-        if (max == 0) max = INT_MAX;
+-        ecode += 5;
+-        break;
+      case OP_CRRANGE:
+      case OP_CRMINRANGE:
+      minimize = (*ecode == OP_CRMINRANGE);
+      min = GET2(ecode, 1);
+      max = GET2(ecode, 3);
+      if (max == 0) max = INT_MAX;
+      ecode += 5;
+      break;
+ 
+-        default:               /* No repeat follows */
+-        if (!match_ref(offset, eptr, length, md, ims))
+-          {
+-          CHECK_PARTIAL();
+-          MRRETURN(MATCH_NOMATCH);
+-          }
+-        eptr += length;
+-        continue;              /* With the main loop */
+      default:               /* No repeat follows */
+      if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
+        {
+        CHECK_PARTIAL();
+        MRRETURN(MATCH_NOMATCH);
+         }
+      eptr += length;
+      continue;              /* With the main loop */
+      }
+ 
+-      /* If the length of the reference is zero, just continue with the
+-      main loop. */
+    /* Handle repeated back references. If the length of the reference is
+    zero, just continue with the main loop. */
+ 
+-      if (length == 0) continue;
+    if (length == 0) continue;
+ 
+-      /* First, ensure the minimum number of matches are present. We get back
+-      the length of the reference string explicitly rather than passing the
+-      address of eptr, so that eptr can be a register variable. */
+    /* First, ensure the minimum number of matches are present. We get back
+    the length of the reference string explicitly rather than passing the
+    address of eptr, so that eptr can be a register variable. */
+ 
+-      for (i = 1; i <= min; i++)
+    for (i = 1; i <= min; i++)
+      {
+      int slength; 
+      if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+         {
+-        if (!match_ref(offset, eptr, length, md, ims))
+-          {
+-          CHECK_PARTIAL();
+-          MRRETURN(MATCH_NOMATCH);
+-          }
+-        eptr += length;
+        CHECK_PARTIAL();
+        MRRETURN(MATCH_NOMATCH);
+         }
+      eptr += slength;
+      }
+ 
+-      /* If min = max, continue at the same level without recursion.
+-      They are not both allowed to be zero. */
+    /* If min = max, continue at the same level without recursion.
+    They are not both allowed to be zero. */
+ 
+-      if (min == max) continue;
+    if (min == max) continue;
+ 
+-      /* If minimizing, keep trying and advancing the pointer */
+    /* If minimizing, keep trying and advancing the pointer */
+ 
+-      if (minimize)
+    if (minimize)
+      {
+      for (fi = min;; fi++)
+         {
+-        for (fi = min;; fi++)
+        int slength; 
+        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+        if (fi >= max) MRRETURN(MATCH_NOMATCH);
+        if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+           {
+-          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+-          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+-          if (fi >= max) MRRETURN(MATCH_NOMATCH);
+-          if (!match_ref(offset, eptr, length, md, ims))
+-            {
+-            CHECK_PARTIAL();
+-            MRRETURN(MATCH_NOMATCH);
+-            }
+-          eptr += length;
+          CHECK_PARTIAL();
+          MRRETURN(MATCH_NOMATCH);
+           }
+-        /* Control never gets here */
+        eptr += slength;
+         }
+      /* Control never gets here */
+      }
+ 
+-      /* If maximizing, find the longest string and work backwards */
+    /* If maximizing, find the longest string and work backwards */
+ 
+-      else
+    else
+      {
+      pp = eptr;
+      for (i = min; i < max; i++)
+         {
+-        pp = eptr;
+-        for (i = min; i < max; i++)
+        int slength; 
+        if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
+           {
+-          if (!match_ref(offset, eptr, length, md, ims))
+-            {
+-            CHECK_PARTIAL();
+-            break;
+-            }
+-          eptr += length;
+          CHECK_PARTIAL();
+          break;
+           }
+-        while (eptr >= pp)
+-          {
+-          RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+-          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+-          eptr -= length;
+-          }
+-        MRRETURN(MATCH_NOMATCH);
+        eptr += slength;
+         }
+      while (eptr >= pp)
+        {
+        RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+        eptr -= length;
+        }
+      MRRETURN(MATCH_NOMATCH);
+       }
+     /* Control never gets here */
+ 
+Index: testdata/testinput1
+===================================================================
+--- testdata/testinput1	(revision 596)
+++ testdata/testinput1	(revision 597)
+@@ -4079,4 +4079,10 @@
+ /^\c/
+     ?
+ 
+/(abc)\1/i
+   abc
+
+/(abc)\1/
+   abc
+
+ /-- End of testinput1 --/
+Index: testdata/testoutput1
+===================================================================
+--- testdata/testoutput1	(revision 596)
+++ testdata/testoutput1	(revision 597)
+@@ -6666,4 +6666,12 @@
+     ?
+  0: ?
+ 
+/(abc)\1/i
+   abc
+No match
+
+/(abc)\1/
+   abc
+No match
+
+ /-- End of testinput1 --/
+Index: testdata/testinput4
+===================================================================
+--- testdata/testinput4	(revision 596)
+++ testdata/testinput4	(revision 597)
+@@ -644,4 +644,10 @@
+ /A*/g8
+     AAB\x{123}BAA
+ 
+/(abc)\1/8i
+   abc
+
+/(abc)\1/8
+   abc
+
+ /-- End of testinput4 --/
+Index: testdata/testoutput4
+===================================================================
+--- testdata/testoutput4	(revision 596)
+++ testdata/testoutput4	(revision 597)
+@@ -1128,4 +1128,12 @@
+  0: AA
+  0: 
+ 
+/(abc)\1/8i
+   abc
+No match
+
+/(abc)\1/8
+   abc
+No match
+
+ /-- End of testinput4 --/
+Index: pcre_exec.c
+===================================================================
+--- pcre_exec.c	(revision 596)
+++ pcre_exec.c	(revision 597)
+@@ -193,6 +193,7 @@
+     while (p < endptr)
+       {
+       int c, d;
+      if (eptr >= md->end_subject) return -1;
+       GETCHARINC(c, eptr);
+       GETCHARINC(d, p);
+       if (c != d && c != UCD_OTHERCASE(d)) return -1;
+@@ -204,16 +205,21 @@
+ 
+   /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
+   is no UCP support. */
+-
+-  while (length-- > 0)
+-    { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+    {
+    if (eptr + length > md->end_subject) return -1; 
+    while (length-- > 0)
+      { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+    }   
+   }
+ 
+ /* In the caseful case, we can just compare the bytes, whether or not we
+ are in UTF-8 mode. */
+ 
+ else
+-  { while (length-- > 0) if (*p++ != *eptr++) return -1; }
+  { 
+  if (eptr + length > md->end_subject) return -1; 
+  while (length-- > 0) if (*p++ != *eptr++) return -1; 
+  }
+ 
+ return eptr - eptr_start;
+ }
--- a/pcre.spec
+++ b/pcre.spec
@ -1,6 +1,6 @@
 Name: pcre
 Version: 8.12
-Release: 3%{?dist}
+Release: 4%{?dist}
 Summary: Perl-compatible regular expression library
 Group: System Environment/Libraries
 License: BSD
@ -11,6 +11,8 @@ Patch0: pcre-8.10-multilib.patch
 Patch1: pcre-8.12-manual_typos.patch
 # Refused by upstream, bug #675477
 Patch2: pcre-8.12-refused_spelling_terminated.patch
+# Fixed upstream in r595 and r597 (http://bugs.exim.org/show_bug.cgi?id=1074), bug #702623
+Patch3: pcre-8.12-caseless_reference.patch
 # New libtool to get rid of rpath
 BuildRequires: autoconf, automake, libtool

@ -44,6 +46,7 @@ Library for static linking for %{name}.
 libtoolize --copy --force && autoreconf
 %patch1 -p0 -b .manual_typos
 %patch2 -p1 -b .terminated_typos
+%patch3 -p0 -b .caseless_reference
 # One contributor's name is non-UTF-8
 for F in ChangeLog; do
    iconv -f latin1 -t utf8 "$F" >"${F}.utf8"
@ -103,6 +106,10 @@ make check
 %doc COPYING LICENCE 

 %changelog
+* Mon May 09 2011 Petr Pisar <ppisar@redhat.com> - 8.12-4
+- Fix caseless back-reference matching in UTF-8 mode when the upper- and
+  lower-case forms of a character have different byte lengths (bug #702623)
+
 * Mon May 09 2011 Petr Pisar <ppisar@redhat.com> - 8.12-3
 - Fix typos in manual pages (bugs #675476, #675477)
 - Clean spec file up