From 54a4973709c4a6c5e87fdf658ed7e21198d0ee9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20P=C3=ADsa=C5=99?= Date: Mon, 9 May 2011 13:58:00 +0200 Subject: [PATCH] Fix caseless reference --- pcre-8.12-caseless_reference.patch | 552 +++++++++++++++++++++++++++++ pcre.spec | 9 +- 2 files changed, 560 insertions(+), 1 deletion(-) create mode 100644 pcre-8.12-caseless_reference.patch diff --git a/pcre-8.12-caseless_reference.patch b/pcre-8.12-caseless_reference.patch new file mode 100644 index 0000000..1ffd957 --- /dev/null +++ b/pcre-8.12-caseless_reference.patch @@ -0,0 +1,552 @@ +r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines +Fix problems with caseless reference matching in UTF-8 mode when the +upper/lower case characters have different lengths. + +and + +r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines +Complete incomplete fix for UTF-8 caseless references of different lengths. + +http://bugs.exim.org/show_bug.cgi?id=1074 + +Petr Pisar: Changelog and comment changes removed. + +Index: testdata/testoutput12 +=================================================================== +--- testdata/testoutput12 (revision 594) ++++ testdata/testoutput12 (revision 595) +@@ -1176,4 +1176,64 @@ + End + ------------------------------------------------------------------ + ++/-- These behaved oddly in Perl, so they are kept in this test --/ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++No match ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥ ++No match ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥⱥ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++No match ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥ ++No match ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥⱥ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{2c65}\x{2c65})\1/8i ++ \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 1: \x{2c65}\x{2c65} ++ ++/(ⱥⱥ)\1/8i ++ ⱥⱥȺȺ ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a} ++ 1: \x{2c65}\x{2c65} ++ ++/(\x{23a}\x{23a}\x{23a})\1Y/8i ++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ ++ 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y ++ 1: \x{23a}\x{23a}\x{23a} ++ ++/(\x{2c65}\x{2c65})\1Y/8i ++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ ++ 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y ++ 1: \x{2c65}\x{2c65} ++ ++/-- --/ ++ + /-- End of testinput12 --/ +Index: testdata/testinput12 +=================================================================== +--- testdata/testinput12 (revision 594) ++++ testdata/testinput12 (revision 595) +@@ -503,4 +503,44 @@ + + /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ + ++/-- These behaved oddly in Perl, so they are kept in this test --/ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})?\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)?\1/8i ++ ȺȺȺⱥⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥ ++ ++/(\x{23a}\x{23a}\x{23a})\1/8i ++ \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} ++ ++/(ȺȺȺ)\1/8i ++ ȺȺȺⱥⱥⱥ ++ ++/(\x{2c65}\x{2c65})\1/8i ++ \x{2c65}\x{2c65}\x{23a}\x{23a} ++ ++/(ⱥⱥ)\1/8i ++ ⱥⱥȺȺ ++ ++/(\x{23a}\x{23a}\x{23a})\1Y/8i ++ X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ ++ ++/(\x{2c65}\x{2c65})\1Y/8i ++ X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ ++ ++/-- --/ ++ + /-- End of testinput12 --/ +Index: pcre_exec.c +=================================================================== +--- pcre_exec.c (revision 594) ++++ pcre_exec.c (revision 595) +@@ -132,24 +132,27 @@ + * Match a back-reference * + *************************************************/ + +-/* If a back reference hasn't been set, the length that is passed is greater +-than the number of characters left in the string, so the match fails. ++/* Normally, if a back reference hasn't been set, the length that is passed is ++negative, so the match always fails. However, in JavaScript compatibility mode, ++the length passed is zero. Note that in caseless UTF-8 mode, the number of ++subject bytes matched may be different to the number of reference bytes. + + Arguments: + offset index into the offset vector +- eptr points into the subject +- length length to be matched ++ eptr pointer into the subject ++ length length of reference to be matched (number of bytes) + md points to match data block + ims the ims flags + +-Returns: TRUE if matched ++Returns: < 0 if not matched, otherwise the number of subject bytes matched + */ + +-static BOOL ++static int + match_ref(int offset, register USPTR eptr, int length, match_data *md, + unsigned long int ims) + { +-USPTR p = md->start_subject + md->offset_vector[offset]; ++USPTR eptr_start = eptr; ++register USPTR p = md->start_subject + md->offset_vector[offset]; + + #ifdef PCRE_DEBUG + if (eptr >= md->end_subject) +@@ -164,9 +167,9 @@ + printf("\n"); + #endif + +-/* Always fail if not enough characters left */ ++/* Always fail if reference not set (and not JavaScript compatible). */ + +-if (length > md->end_subject - eptr) return FALSE; ++if (length < 0) return -1; + + /* Separate the caseless case for speed. In UTF-8 mode we can only do this + properly if Unicode properties are supported. Otherwise, we can check only +@@ -178,13 +181,21 @@ + #ifdef SUPPORT_UCP + if (md->utf8) + { +- USPTR endptr = eptr + length; +- while (eptr < endptr) ++ /* Match characters up to the end of the reference. NOTE: the number of ++ bytes matched may differ, because there are some characters whose upper and ++ lower case versions code as different numbers of bytes. For example, U+023A ++ (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); ++ a sequence of 3 of the former uses 6 bytes, as does a sequence of two of ++ the latter. It is important, therefore, to check the length along the ++ reference, not along the subject (earlier code did this wrong). */ ++ ++ USPTR endptr = p + length; ++ while (p < endptr) + { + int c, d; + GETCHARINC(c, eptr); + GETCHARINC(d, p); +- if (c != d && c != UCD_OTHERCASE(d)) return FALSE; ++ if (c != d && c != UCD_OTHERCASE(d)) return -1; + } + } + else +@@ -195,16 +206,16 @@ + is no UCP support. */ + + while (length-- > 0) +- { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } ++ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } + } + + /* In the caseful case, we can just compare the bytes, whether or not we + are in UTF-8 mode. */ + + else +- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } ++ { while (length-- > 0) if (*p++ != *eptr++) return -1; } + +-return TRUE; ++return eptr - eptr_start; + } + + +@@ -2252,129 +2263,129 @@ + loops). */ + + case OP_REF: +- { +- offset = GET2(ecode, 1) << 1; /* Doubled ref number */ +- ecode += 3; ++ offset = GET2(ecode, 1) << 1; /* Doubled ref number */ ++ ecode += 3; + +- /* If the reference is unset, there are two possibilities: ++ /* If the reference is unset, there are two possibilities: + +- (a) In the default, Perl-compatible state, set the length to be longer +- than the amount of subject left; this ensures that every attempt at a +- match fails. We can't just fail here, because of the possibility of +- quantifiers with zero minima. ++ (a) In the default, Perl-compatible state, set the length negative; ++ this ensures that every attempt at a match fails. We can't just fail ++ here, because of the possibility of quantifiers with zero minima. + +- (b) If the JavaScript compatibility flag is set, set the length to zero +- so that the back reference matches an empty string. ++ (b) If the JavaScript compatibility flag is set, set the length to zero ++ so that the back reference matches an empty string. + +- Otherwise, set the length to the length of what was matched by the +- referenced subpattern. */ ++ Otherwise, set the length to the length of what was matched by the ++ referenced subpattern. */ + +- if (offset >= offset_top || md->offset_vector[offset] < 0) +- length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1); +- else +- length = md->offset_vector[offset+1] - md->offset_vector[offset]; ++ if (offset >= offset_top || md->offset_vector[offset] < 0) ++ length = (md->jscript_compat)? 0 : -1; ++ else ++ length = md->offset_vector[offset+1] - md->offset_vector[offset]; + +- /* Set up for repetition, or handle the non-repeated case */ ++ /* Set up for repetition, or handle the non-repeated case */ + +- switch (*ecode) +- { +- case OP_CRSTAR: +- case OP_CRMINSTAR: +- case OP_CRPLUS: +- case OP_CRMINPLUS: +- case OP_CRQUERY: +- case OP_CRMINQUERY: +- c = *ecode++ - OP_CRSTAR; +- minimize = (c & 1) != 0; +- min = rep_min[c]; /* Pick up values from tables; */ +- max = rep_max[c]; /* zero for max => infinity */ +- if (max == 0) max = INT_MAX; +- break; ++ switch (*ecode) ++ { ++ case OP_CRSTAR: ++ case OP_CRMINSTAR: ++ case OP_CRPLUS: ++ case OP_CRMINPLUS: ++ case OP_CRQUERY: ++ case OP_CRMINQUERY: ++ c = *ecode++ - OP_CRSTAR; ++ minimize = (c & 1) != 0; ++ min = rep_min[c]; /* Pick up values from tables; */ ++ max = rep_max[c]; /* zero for max => infinity */ ++ if (max == 0) max = INT_MAX; ++ break; + +- case OP_CRRANGE: +- case OP_CRMINRANGE: +- minimize = (*ecode == OP_CRMINRANGE); +- min = GET2(ecode, 1); +- max = GET2(ecode, 3); +- if (max == 0) max = INT_MAX; +- ecode += 5; +- break; ++ case OP_CRRANGE: ++ case OP_CRMINRANGE: ++ minimize = (*ecode == OP_CRMINRANGE); ++ min = GET2(ecode, 1); ++ max = GET2(ecode, 3); ++ if (max == 0) max = INT_MAX; ++ ecode += 5; ++ break; + +- default: /* No repeat follows */ +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; +- continue; /* With the main loop */ ++ default: /* No repeat follows */ ++ if ((length = match_ref(offset, eptr, length, md, ims)) < 0) ++ { ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } ++ eptr += length; ++ continue; /* With the main loop */ ++ } + +- /* If the length of the reference is zero, just continue with the +- main loop. */ ++ /* Handle repeated back references. If the length of the reference is ++ zero, just continue with the main loop. */ + +- if (length == 0) continue; ++ if (length == 0) continue; + +- /* First, ensure the minimum number of matches are present. We get back +- the length of the reference string explicitly rather than passing the +- address of eptr, so that eptr can be a register variable. */ ++ /* First, ensure the minimum number of matches are present. We get back ++ the length of the reference string explicitly rather than passing the ++ address of eptr, so that eptr can be a register variable. */ + +- for (i = 1; i <= min; i++) ++ for (i = 1; i <= min; i++) ++ { ++ int slength; ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } ++ eptr += slength; ++ } + +- /* If min = max, continue at the same level without recursion. +- They are not both allowed to be zero. */ ++ /* If min = max, continue at the same level without recursion. ++ They are not both allowed to be zero. */ + +- if (min == max) continue; ++ if (min == max) continue; + +- /* If minimizing, keep trying and advancing the pointer */ ++ /* If minimizing, keep trying and advancing the pointer */ + +- if (minimize) ++ if (minimize) ++ { ++ for (fi = min;; fi++) + { +- for (fi = min;; fi++) ++ int slength; ++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); ++ if (rrc != MATCH_NOMATCH) RRETURN(rrc); ++ if (fi >= max) MRRETURN(MATCH_NOMATCH); ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); +- if (rrc != MATCH_NOMATCH) RRETURN(rrc); +- if (fi >= max) MRRETURN(MATCH_NOMATCH); +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- MRRETURN(MATCH_NOMATCH); +- } +- eptr += length; ++ CHECK_PARTIAL(); ++ MRRETURN(MATCH_NOMATCH); + } +- /* Control never gets here */ ++ eptr += slength; + } ++ /* Control never gets here */ ++ } + +- /* If maximizing, find the longest string and work backwards */ ++ /* If maximizing, find the longest string and work backwards */ + +- else ++ else ++ { ++ pp = eptr; ++ for (i = min; i < max; i++) + { +- pp = eptr; +- for (i = min; i < max; i++) ++ int slength; ++ if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) + { +- if (!match_ref(offset, eptr, length, md, ims)) +- { +- CHECK_PARTIAL(); +- break; +- } +- eptr += length; ++ CHECK_PARTIAL(); ++ break; + } +- while (eptr >= pp) +- { +- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); +- if (rrc != MATCH_NOMATCH) RRETURN(rrc); +- eptr -= length; +- } +- MRRETURN(MATCH_NOMATCH); ++ eptr += slength; + } ++ while (eptr >= pp) ++ { ++ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); ++ if (rrc != MATCH_NOMATCH) RRETURN(rrc); ++ eptr -= length; ++ } ++ MRRETURN(MATCH_NOMATCH); + } + /* Control never gets here */ + +Index: testdata/testinput1 +=================================================================== +--- testdata/testinput1 (revision 596) ++++ testdata/testinput1 (revision 597) +@@ -4079,4 +4079,10 @@ + /^\c/ + ? + ++/(abc)\1/i ++ abc ++ ++/(abc)\1/ ++ abc ++ + /-- End of testinput1 --/ +Index: testdata/testoutput1 +=================================================================== +--- testdata/testoutput1 (revision 596) ++++ testdata/testoutput1 (revision 597) +@@ -6666,4 +6666,12 @@ + ? + 0: ? + ++/(abc)\1/i ++ abc ++No match ++ ++/(abc)\1/ ++ abc ++No match ++ + /-- End of testinput1 --/ +Index: testdata/testinput4 +=================================================================== +--- testdata/testinput4 (revision 596) ++++ testdata/testinput4 (revision 597) +@@ -644,4 +644,10 @@ + /A*/g8 + AAB\x{123}BAA + ++/(abc)\1/8i ++ abc ++ ++/(abc)\1/8 ++ abc ++ + /-- End of testinput4 --/ +Index: testdata/testoutput4 +=================================================================== +--- testdata/testoutput4 (revision 596) ++++ testdata/testoutput4 (revision 597) +@@ -1128,4 +1128,12 @@ + 0: AA + 0: + ++/(abc)\1/8i ++ abc ++No match ++ ++/(abc)\1/8 ++ abc ++No match ++ + /-- End of testinput4 --/ +Index: pcre_exec.c +=================================================================== +--- pcre_exec.c (revision 596) ++++ pcre_exec.c (revision 597) +@@ -193,6 +193,7 @@ + while (p < endptr) + { + int c, d; ++ if (eptr >= md->end_subject) return -1; + GETCHARINC(c, eptr); + GETCHARINC(d, p); + if (c != d && c != UCD_OTHERCASE(d)) return -1; +@@ -204,16 +205,21 @@ + + /* The same code works when not in UTF-8 mode and in UTF-8 mode when there + is no UCP support. */ +- +- while (length-- > 0) +- { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } ++ { ++ if (eptr + length > md->end_subject) return -1; ++ while (length-- > 0) ++ { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } ++ } + } + + /* In the caseful case, we can just compare the bytes, whether or not we + are in UTF-8 mode. */ + + else +- { while (length-- > 0) if (*p++ != *eptr++) return -1; } ++ { ++ if (eptr + length > md->end_subject) return -1; ++ while (length-- > 0) if (*p++ != *eptr++) return -1; ++ } + + return eptr - eptr_start; + } diff --git a/pcre.spec b/pcre.spec index 57b1344..5e6b42e 100644 --- a/pcre.spec +++ b/pcre.spec @@ -1,6 +1,6 @@ Name: pcre Version: 8.12 -Release: 3%{?dist} +Release: 4%{?dist} Summary: Perl-compatible regular expression library Group: System Environment/Libraries License: BSD @@ -11,6 +11,8 @@ Patch0: pcre-8.10-multilib.patch Patch1: pcre-8.12-manual_typos.patch # Refused by upstream, bug #675477 Patch2: pcre-8.12-refused_spelling_terminated.patch +# In upstream, bug #702623 +Patch3: pcre-8.12-caseless_reference.patch # New libtool to get rid of rpath BuildRequires: autoconf, automake, libtool @@ -44,6 +46,7 @@ Library for static linking for %{name}. libtoolize --copy --force && autoreconf %patch1 -p0 -b .manual_typos %patch2 -p1 -b .terminated_typos +%patch3 -p0 -b .caseless_reference # One contributor's name is non-UTF-8 for F in ChangeLog; do iconv -f latin1 -t utf8 "$F" >"${F}.utf8" @@ -103,6 +106,10 @@ make check %doc COPYING LICENCE %changelog +* Mon May 09 2011 Petr Pisar - 8.12-4 +- Fix caseless reference matching in UTF-8 mode when the upper/lower case + characters have different lengths (bug #702623) + * Mon May 09 2011 Petr Pisar - 8.12-3 - Fix typos in manual pages (bugs #675476, #675477) - Clean spec file up