From 8565d846f405861e3f6ae34914bb24b5e2002c45 Mon Sep 17 00:00:00 2001 From: ph10 Date: Wed, 3 Aug 2016 17:22:59 +0000 Subject: [PATCH] Fix bug that caused chars > 255 not to be matched by classes like [\W\pL] when PCRE2_UCP was not set. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ported to 10.22: commit 143f7f5c4dabd978117d415d2016c7595a7b9867 Author: ph10 Date: Wed Aug 3 17:22:59 2016 +0000 Fix bug that caused chars > 255 not to be matched by classes like [\W\pL] when PCRE2_UCP was not set. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@554 6239d852-aaf2-0410-a92c-79f79f948069 Signed-off-by: Petr Písař --- src/pcre2_compile.c | 49 ++++++++++++++++++++++++++++++++++-------------- testdata/testinput10 | 9 +++++++++ testdata/testinput12 | 11 +++++++++++ testdata/testinput5 | 26 ++++++++++++++++--------- testdata/testoutput10 | 25 ++++++++++++++++++++++++ testdata/testoutput12-16 | 29 ++++++++++++++++++++++++++++ testdata/testoutput12-32 | 29 ++++++++++++++++++++++++++++ testdata/testoutput5 | 49 ++++++++++++++++++++++++------------------------ 8 files changed, 179 insertions(+), 48 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index bb9736c..a92a69a 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -4950,11 +4950,11 @@ for (;; ptr++) } #ifdef SUPPORT_WIDE_CHARS - /* If any wide characters have been encountered, set xclass = TRUE. Then, - in the pre-compile phase, accumulate the length of the wide characters - and reset the pointer. This is so that very large classes that contain a - zillion wide characters do not overwrite the work space (which is on the - stack). */ + /* If any wide characters or Unicode properties have been encountered, + set xclass = TRUE. Then, in the pre-compile phase, accumulate the length + of the wide characters etc. and reset the pointer. 
This is so that very + large classes that contain a zillion wide characters do not overwrite the + work space (which is on the stack). */ if (class_uchardata > class_uchardata_base) { @@ -4994,22 +4994,43 @@ for (;; ptr++) negated). This requirement is indicated by match_all_or_no_wide_chars being true. We do this by including an explicit range, which works in both cases. + When there *are* properties in a positive UTF-8 or any 16-bit or 32-bit + class where \S etc is present without PCRE2_UCP, causing an extended class + to be compiled, we make sure that all characters > 255 are included by + forcing match_all_or_no_wide_chars to be true. + If, when generating an xclass, there are no characters < 256, we can omit the bitmap in the actual compiled code. */ -#ifdef SUPPORT_WIDE_CHARS +#ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ + if (xclass && ( #ifdef SUPPORT_UNICODE - if (xclass && (xclass_has_prop || !should_flip_negation || - (options & PCRE2_UCP) != 0)) -#elif PCRE2_CODE_UNIT_WIDTH != 8 - if (xclass && (xclass_has_prop || !should_flip_negation)) + (options & PCRE2_UCP) != 0 || #endif + xclass_has_prop || !should_flip_negation)) { - if (match_all_or_no_wide_chars) + if (match_all_or_no_wide_chars || ( +#if PCRE2_CODE_UNIT_WIDTH == 8 + utf && +#endif + should_flip_negation && !negate_class && (options & PCRE2_UCP) == 0)) { *class_uchardata++ = XCL_RANGE; - class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); - class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); + if (utf) /* Will always be utf in the 8-bit library */ + { + class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); + class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata); + } + else /* Can only happen for the 16-bit & 32-bit libraries */ + { +#if PCRE2_CODE_UNIT_WIDTH == 16 + *class_uchardata++ = 0x100; + *class_uchardata++ = 0xffffu; +#elif PCRE2_CODE_UNIT_WIDTH == 32 + *class_uchardata++ = 0x100; + *class_uchardata++ = 
0xffffffffu; +#endif + } } *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; @@ -5037,7 +5058,7 @@ for (;; ptr++) PUT(previous, 1, (int)(code - previous)); break; /* End of class handling */ } -#endif +#endif /* SUPPORT_WIDE_CHARS */ /* If there are no characters > 255, or they are all to be included or excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the diff --git a/testdata/testinput10 b/testdata/testinput10 index 550e1c9..4b80778 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -445,4 +445,13 @@ /(?<=(a)(?-1))x/I,utf a\x80zx\=offset=3 +/[\W\p{Any}]/B + abc + 123 + +/[\W\pL]/B + abc +\= Expect no match + 123 + # End of testinput10 diff --git a/testdata/testinput12 b/testdata/testinput12 index 14a7715..29934ec 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -343,4 +343,15 @@ /./utf \x{110000} +/[\W\p{Any}]/B + abc + 123 + +/[\W\pL]/B + abc + \x{100} + \x{308} +\= Expect no match + 123 + # End of testinput12 diff --git a/testdata/testinput5 b/testdata/testinput5 index 2e13a7c..1f44ceb 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1675,15 +1675,6 @@ /((?\d)|(?\p{L}))/g,substitute_extended,replace=<${digit:+digit; :not digit; }${letter:+letter:not a letter}> ab12cde -/[\W\p{Any}]/B - abc - 123 - -/[\W\pL]/B - abc -\= Expect no match - 123 - /(*UCP)(*UTF)[[:>:]]X/B /abc/utf,replace=xyz @@ -1716,4 +1707,21 @@ /(*UTF)C\x09((? 
-/[\W\p{Any}]/B ------------------------------------------------------------------- - Bra - [\x00-/:-@[-^`{-\xff\p{Any}] - Ket - End ------------------------------------------------------------------- - abc - 0: a - 123 - 0: 1 - -/[\W\pL]/B ------------------------------------------------------------------- - Bra - [\x00-/:-@[-^`{-\xff\p{L}] - Ket - End ------------------------------------------------------------------- - abc - 0: a -\= Expect no match - 123 -No match - /(*UCP)(*UTF)[[:>:]]X/B ------------------------------------------------------------------ Bra @@ -4161,4 +4136,28 @@ No match /(*UTF)C\x09((?