pcre/pcre-8.12-caseless_reference.patch
2011-05-09 13:58:00 +02:00

553 lines
16 KiB
Diff
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines
Fix problems with caseless reference matching in UTF-8 mode when the
upper/lower case characters have different lengths.
and
r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines
Complete incomplete fix for UTF-8 caseless references of different lengths.
Upstream bug report: http://bugs.exim.org/show_bug.cgi?id=1074
Petr Pisar: ChangeLog and comment changes removed.
Index: testdata/testoutput12
===================================================================
--- testdata/testoutput12 (revision 594)
+++ testdata/testoutput12 (revision 595)
@@ -1176,4 +1176,64 @@
End
------------------------------------------------------------------
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+No match
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥ
+No match
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥⱥ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1/8i
+ \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+
+/(ⱥⱥ)\1/8i
+ ⱥⱥȺȺ
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}
+ 1: \x{2c65}\x{2c65}
+
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y
+ 1: \x{23a}\x{23a}\x{23a}
+
+/(\x{2c65}\x{2c65})\1Y/8i
+ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y
+ 1: \x{2c65}\x{2c65}
+
+/-- --/
+
/-- End of testinput12 --/
Index: testdata/testinput12
===================================================================
--- testdata/testinput12 (revision 594)
+++ testdata/testinput12 (revision 595)
@@ -503,4 +503,44 @@
/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+/-- These behaved oddly in Perl, so they are kept in this test --/
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})?\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)?\1/8i
+ ȺȺȺⱥⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥ
+
+/(\x{23a}\x{23a}\x{23a})\1/8i
+ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}
+
+/(ȺȺȺ)\1/8i
+ ȺȺȺⱥⱥⱥ
+
+/(\x{2c65}\x{2c65})\1/8i
+ \x{2c65}\x{2c65}\x{23a}\x{23a}
+
+/(ⱥⱥ)\1/8i
+ ⱥⱥȺȺ
+
+/(\x{23a}\x{23a}\x{23a})\1Y/8i
+ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ
+
+/(\x{2c65}\x{2c65})\1Y/8i
+ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ
+
+/-- --/
+
/-- End of testinput12 --/
Index: pcre_exec.c
===================================================================
--- pcre_exec.c (revision 594)
+++ pcre_exec.c (revision 595)
@@ -132,24 +132,27 @@
* Match a back-reference *
*************************************************/
-/* If a back reference hasn't been set, the length that is passed is greater
-than the number of characters left in the string, so the match fails.
+/* Normally, if a back reference hasn't been set, the length that is passed is
+negative, so the match always fails. However, in JavaScript compatibility mode,
+the length passed is zero. Note that in caseless UTF-8 mode, the number of
+subject bytes matched may be different to the number of reference bytes.
Arguments:
offset index into the offset vector
- eptr points into the subject
- length length to be matched
+ eptr pointer into the subject
+ length length of reference to be matched (number of bytes)
md points to match data block
ims the ims flags
-Returns: TRUE if matched
+Returns: < 0 if not matched, otherwise the number of subject bytes matched
*/
-static BOOL
+static int
match_ref(int offset, register USPTR eptr, int length, match_data *md,
unsigned long int ims)
{
-USPTR p = md->start_subject + md->offset_vector[offset];
+USPTR eptr_start = eptr;
+register USPTR p = md->start_subject + md->offset_vector[offset];
#ifdef PCRE_DEBUG
if (eptr >= md->end_subject)
@@ -164,9 +167,9 @@
printf("\n");
#endif
-/* Always fail if not enough characters left */
+/* Always fail if reference not set (and not JavaScript compatible). */
-if (length > md->end_subject - eptr) return FALSE;
+if (length < 0) return -1;
/* Separate the caseless case for speed. In UTF-8 mode we can only do this
properly if Unicode properties are supported. Otherwise, we can check only
@@ -178,13 +181,21 @@
#ifdef SUPPORT_UCP
if (md->utf8)
{
- USPTR endptr = eptr + length;
- while (eptr < endptr)
+ /* Match characters up to the end of the reference. NOTE: the number of
+ bytes matched may differ, because there are some characters whose upper and
+ lower case versions code as different numbers of bytes. For example, U+023A
+ (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
+ a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
+ the latter. It is important, therefore, to check the length along the
+ reference, not along the subject (earlier code did this wrong). */
+
+ USPTR endptr = p + length;
+ while (p < endptr)
{
int c, d;
GETCHARINC(c, eptr);
GETCHARINC(d, p);
- if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
+ if (c != d && c != UCD_OTHERCASE(d)) return -1;
}
}
else
@@ -195,16 +206,16 @@
is no UCP support. */
while (length-- > 0)
- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
+ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
}
/* In the caseful case, we can just compare the bytes, whether or not we
are in UTF-8 mode. */
else
- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
+ { while (length-- > 0) if (*p++ != *eptr++) return -1; }
-return TRUE;
+return eptr - eptr_start;
}
@@ -2252,129 +2263,129 @@
loops). */
case OP_REF:
- {
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3;
+ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
+ ecode += 3;
- /* If the reference is unset, there are two possibilities:
+ /* If the reference is unset, there are two possibilities:
- (a) In the default, Perl-compatible state, set the length to be longer
- than the amount of subject left; this ensures that every attempt at a
- match fails. We can't just fail here, because of the possibility of
- quantifiers with zero minima.
+ (a) In the default, Perl-compatible state, set the length negative;
+ this ensures that every attempt at a match fails. We can't just fail
+ here, because of the possibility of quantifiers with zero minima.
- (b) If the JavaScript compatibility flag is set, set the length to zero
- so that the back reference matches an empty string.
+ (b) If the JavaScript compatibility flag is set, set the length to zero
+ so that the back reference matches an empty string.
- Otherwise, set the length to the length of what was matched by the
- referenced subpattern. */
+ Otherwise, set the length to the length of what was matched by the
+ referenced subpattern. */
- if (offset >= offset_top || md->offset_vector[offset] < 0)
- length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
- else
- length = md->offset_vector[offset+1] - md->offset_vector[offset];
+ if (offset >= offset_top || md->offset_vector[offset] < 0)
+ length = (md->jscript_compat)? 0 : -1;
+ else
+ length = md->offset_vector[offset+1] - md->offset_vector[offset];
- /* Set up for repetition, or handle the non-repeated case */
+ /* Set up for repetition, or handle the non-repeated case */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
+ switch (*ecode)
+ {
+ case OP_CRSTAR:
+ case OP_CRMINSTAR:
+ case OP_CRPLUS:
+ case OP_CRMINPLUS:
+ case OP_CRQUERY:
+ case OP_CRMINQUERY:
+ c = *ecode++ - OP_CRSTAR;
+ minimize = (c & 1) != 0;
+ min = rep_min[c]; /* Pick up values from tables; */
+ max = rep_max[c]; /* zero for max => infinity */
+ if (max == 0) max = INT_MAX;
+ break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
+ case OP_CRRANGE:
+ case OP_CRMINRANGE:
+ minimize = (*ecode == OP_CRMINRANGE);
+ min = GET2(ecode, 1);
+ max = GET2(ecode, 3);
+ if (max == 0) max = INT_MAX;
+ ecode += 5;
+ break;
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
- continue; /* With the main loop */
+ default: /* No repeat follows */
+ if ((length = match_ref(offset, eptr, length, md, ims)) < 0)
+ {
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
+ eptr += length;
+ continue; /* With the main loop */
+ }
- /* If the length of the reference is zero, just continue with the
- main loop. */
+ /* Handle repeated back references. If the length of the reference is
+ zero, just continue with the main loop. */
- if (length == 0) continue;
+ if (length == 0) continue;
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
+ /* First, ensure the minimum number of matches are present. We get back
+ the length of the reference string explicitly rather than passing the
+ address of eptr, so that eptr can be a register variable. */
- for (i = 1; i <= min; i++)
+ for (i = 1; i <= min; i++)
+ {
+ int slength;
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
+ eptr += slength;
+ }
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
+ /* If min = max, continue at the same level without recursion.
+ They are not both allowed to be zero. */
- if (min == max) continue;
+ if (min == max) continue;
- /* If minimizing, keep trying and advancing the pointer */
+ /* If minimizing, keep trying and advancing the pointer */
- if (minimize)
+ if (minimize)
+ {
+ for (fi = min;; fi++)
{
- for (fi = min;; fi++)
+ int slength;
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max) MRRETURN(MATCH_NOMATCH);
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max) MRRETURN(MATCH_NOMATCH);
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- MRRETURN(MATCH_NOMATCH);
- }
- eptr += length;
+ CHECK_PARTIAL();
+ MRRETURN(MATCH_NOMATCH);
}
- /* Control never gets here */
+ eptr += slength;
}
+ /* Control never gets here */
+ }
- /* If maximizing, find the longest string and work backwards */
+ /* If maximizing, find the longest string and work backwards */
- else
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
{
- pp = eptr;
- for (i = min; i < max; i++)
+ int slength;
+ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0)
{
- if (!match_ref(offset, eptr, length, md, ims))
- {
- CHECK_PARTIAL();
- break;
- }
- eptr += length;
+ CHECK_PARTIAL();
+ break;
}
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
- }
- MRRETURN(MATCH_NOMATCH);
+ eptr += slength;
}
+ while (eptr >= pp)
+ {
+ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= length;
+ }
+ MRRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
Index: testdata/testinput1
===================================================================
--- testdata/testinput1 (revision 596)
+++ testdata/testinput1 (revision 597)
@@ -4079,4 +4079,10 @@
/^\c/
?
+/(abc)\1/i
+ abc
+
+/(abc)\1/
+ abc
+
/-- End of testinput1 --/
Index: testdata/testoutput1
===================================================================
--- testdata/testoutput1 (revision 596)
+++ testdata/testoutput1 (revision 597)
@@ -6666,4 +6666,12 @@
?
0: ?
+/(abc)\1/i
+ abc
+No match
+
+/(abc)\1/
+ abc
+No match
+
/-- End of testinput1 --/
Index: testdata/testinput4
===================================================================
--- testdata/testinput4 (revision 596)
+++ testdata/testinput4 (revision 597)
@@ -644,4 +644,10 @@
/A*/g8
AAB\x{123}BAA
+/(abc)\1/8i
+ abc
+
+/(abc)\1/8
+ abc
+
/-- End of testinput4 --/
Index: testdata/testoutput4
===================================================================
--- testdata/testoutput4 (revision 596)
+++ testdata/testoutput4 (revision 597)
@@ -1128,4 +1128,12 @@
0: AA
0:
+/(abc)\1/8i
+ abc
+No match
+
+/(abc)\1/8
+ abc
+No match
+
/-- End of testinput4 --/
Index: pcre_exec.c
===================================================================
--- pcre_exec.c (revision 596)
+++ pcre_exec.c (revision 597)
@@ -193,6 +193,7 @@
while (p < endptr)
{
int c, d;
+ if (eptr >= md->end_subject) return -1;
GETCHARINC(c, eptr);
GETCHARINC(d, p);
if (c != d && c != UCD_OTHERCASE(d)) return -1;
@@ -204,16 +205,21 @@
/* The same code works when not in UTF-8 mode and in UTF-8 mode when there
is no UCP support. */
-
- while (length-- > 0)
- { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+ {
+ if (eptr + length > md->end_subject) return -1;
+ while (length-- > 0)
+ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
+ }
}
/* In the caseful case, we can just compare the bytes, whether or not we
are in UTF-8 mode. */
else
- { while (length-- > 0) if (*p++ != *eptr++) return -1; }
+ {
+ if (eptr + length > md->end_subject) return -1;
+ while (length-- > 0) if (*p++ != *eptr++) return -1;
+ }
return eptr - eptr_start;
}