Recognize all Unicode "Pattern White Space" characters with /x option in a pattern
This commit is contained in:
parent
bf7e53caa6
commit
0cdc995257
@ -0,0 +1,174 @@
|
|||||||
|
From 71340653cd375c01ada053c63d7c55d0ca475b47 Mon Sep 17 00:00:00 2001
|
||||||
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||||||
|
Date: Fri, 3 Aug 2018 09:38:36 +0000
|
||||||
|
Subject: [PATCH] Make /x more Perl-compatible by recognizing all of Unicode's
|
||||||
|
"Pattern White Space" characters, not just the ASCII ones.
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@977 6239d852-aaf2-0410-a92c-79f79f948069
|
||||||
|
|
||||||
|
Petr Písař: Ported to 10.31.
|
||||||
|
|
||||||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||||
|
---
|
||||||
|
src/pcre2_compile.c | 25 +++++++++++++++++++------
|
||||||
|
testdata/testinput4 | 15 +++++++++++++++
|
||||||
|
testdata/testinput5 | 13 +++++++++++++
|
||||||
|
testdata/testoutput4 | 18 ++++++++++++++++++
|
||||||
|
testdata/testoutput5 | 16 ++++++++++++++++
|
||||||
|
5 files changed, 81 insertions(+), 6 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 7ff8b4c..1d62a38 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -2434,11 +2434,17 @@ while (ptr < ptrend)
|
||||||
|
/* EITHER: not both options set */
|
||||||
|
((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
|
||||||
|
(PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
|
||||||
|
- /* OR: character > 255 */
|
||||||
|
- c > 255 ||
|
||||||
|
- /* OR: not a # comment or white space */
|
||||||
|
- (c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0)
|
||||||
|
- ))
|
||||||
|
+#ifdef SUPPORT_UNICODE
|
||||||
|
+ /* OR: character > 255 AND not Unicode Pattern White Space */
|
||||||
|
+ (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
|
||||||
|
+#endif
|
||||||
|
+ /* OR: not a # comment or isspace() white space */
|
||||||
|
+ (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
|
||||||
|
+#ifdef SUPPORT_UNICODE
|
||||||
|
+ /* and not CHAR_NEL when Unicode is supported */
|
||||||
|
+ && c != CHAR_NEL
|
||||||
|
+#endif
|
||||||
|
+ )))
|
||||||
|
{
|
||||||
|
PCRE2_SIZE verbnamelength;
|
||||||
|
|
||||||
|
@@ -2510,11 +2516,18 @@ while (ptr < ptrend)
|
||||||
|
|
||||||
|
/* Skip over whitespace and # comments in extended mode. Note that c is a
|
||||||
|
character, not a code unit, so we must not use MAX_255 to test its size
|
||||||
|
- because MAX_255 tests code units and is assumed TRUE in 8-bit mode. */
|
||||||
|
+ because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
|
||||||
|
+ whitespace characters are those designated as "Pattern White Space" by
|
||||||
|
+ Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
|
||||||
|
+ U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
|
||||||
|
+ subset of space characters that match \h and \v. */
|
||||||
|
|
||||||
|
if ((options & PCRE2_EXTENDED) != 0)
|
||||||
|
{
|
||||||
|
if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
|
||||||
|
+#ifdef SUPPORT_UNICODE
|
||||||
|
+ if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
|
||||||
|
+#endif
|
||||||
|
if (c == CHAR_NUMBER_SIGN)
|
||||||
|
{
|
||||||
|
while (ptr < ptrend)
|
||||||
|
diff --git a/testdata/testinput4 b/testdata/testinput4
|
||||||
|
index 0ef7b8e..6884f60 100644
|
||||||
|
--- a/testdata/testinput4
|
||||||
|
+++ b/testdata/testinput4
|
||||||
|
@@ -2300,5 +2300,20 @@
|
||||||
|
\x{123}\x{122}\x{123}
|
||||||
|
\= Expect no match
|
||||||
|
\x{123}\x{124}\x{123}
|
||||||
|
+
|
||||||
|
+# Test the full list of Unicode "Pattern White Space" characters that are to
|
||||||
|
+# be ignored by /x. The pattern lines below may show up oddly in text editors
|
||||||
|
+# or when listed to the screen. Note that characters such as U+2002, which are
|
||||||
|
+# matched as space by \h and \v are *not* "Pattern White Space".
|
||||||
|
+
|
||||||
|
+/A
B/x,utf
|
||||||
|
+ AB
|
||||||
|
+
|
||||||
|
+/A B/x,utf
|
||||||
|
+ A\x{2002}B
|
||||||
|
+\= Expect no match
|
||||||
|
+ AB
|
||||||
|
+
|
||||||
|
+# -------
|
||||||
|
|
||||||
|
# End of testinput4
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index 0366136..ebeee07 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -2059,5 +2059,18 @@
|
||||||
|
\x{1F1E6}\x{1F1E7}\x{1F1E7}B
|
||||||
|
\x{1F1E6}\x{1F1E7}\x{1F1E7}\x{1F1E6}B
|
||||||
|
|
||||||
|
+# This tests the non-UTF Unicode NEL pattern whitespace character, only
|
||||||
|
+# recognized by PCRE2 with /x when there is Unicode support.
|
||||||
|
+
|
||||||
|
+/A
|
||||||
|
+
?B/x
|
||||||
|
+ AB
|
||||||
|
+
|
||||||
|
+# This tests Unicode Pattern White Space characters in verb names when they
|
||||||
|
+# are being processed with PCRE2_EXTENDED. Note: there are UTF-8 characters
|
||||||
|
+# with code points greater than 255 between A, B, and C in the pattern.
|
||||||
|
+
|
||||||
|
+/(*: AB
C)abc/x,utf,mark,alt_verbnames
|
||||||
|
+ abc
|
||||||
|
|
||||||
|
# End of testinput5
|
||||||
|
diff --git a/testdata/testoutput4 b/testdata/testoutput4
|
||||||
|
index 6056e6d..51c8219 100644
|
||||||
|
--- a/testdata/testoutput4
|
||||||
|
+++ b/testdata/testoutput4
|
||||||
|
@@ -3728,5 +3728,23 @@ No match
|
||||||
|
\= Expect no match
|
||||||
|
\x{123}\x{124}\x{123}
|
||||||
|
No match
|
||||||
|
+
|
||||||
|
+# Test the full list of Unicode "Pattern White Space" characters that are to
|
||||||
|
+# be ignored by /x. The pattern lines below may show up oddly in text editors
|
||||||
|
+# or when listed to the screen. Note that characters such as U+2002, which are
|
||||||
|
+# matched as space by \h and \v are *not* "Pattern White Space".
|
||||||
|
+
|
||||||
|
+/A
B/x,utf
|
||||||
|
+ AB
|
||||||
|
+ 0: AB
|
||||||
|
+
|
||||||
|
+/A B/x,utf
|
||||||
|
+ A\x{2002}B
|
||||||
|
+ 0: A\x{2002}B
|
||||||
|
+\= Expect no match
|
||||||
|
+ AB
|
||||||
|
+No match
|
||||||
|
+
|
||||||
|
+# -------
|
||||||
|
|
||||||
|
# End of testinput4
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index 4b3171c..1392e98 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -4700,5 +4700,21 @@ Callout 0: last capture = 1
|
||||||
|
1: \x{1f1e6}\x{1f1e7}
|
||||||
|
2: \x{1f1e7}\x{1f1e6}
|
||||||
|
|
||||||
|
+# This tests the non-UTF Unicode NEL pattern whitespace character, only
|
||||||
|
+# recognized by PCRE2 with /x when there is Unicode support.
|
||||||
|
+
|
||||||
|
+/A
|
||||||
|
+
?B/x
|
||||||
|
+ AB
|
||||||
|
+ 0: AB
|
||||||
|
+
|
||||||
|
+# This tests Unicode Pattern White Space characters in verb names when they
|
||||||
|
+# are being processed with PCRE2_EXTENDED. Note: there are UTF-8 characters
|
||||||
|
+# with code points greater than 255 between A, B, and C in the pattern.
|
||||||
|
+
|
||||||
|
+/(*: AB
C)abc/x,utf,mark,alt_verbnames
|
||||||
|
+ abc
|
||||||
|
+ 0: abc
|
||||||
|
+MK: ABC
|
||||||
|
|
||||||
|
# End of testinput5
|
||||||
|
--
|
||||||
|
2.14.4
|
||||||
|
|
@ -9,7 +9,7 @@
|
|||||||
#%%global rcversion RC1
|
#%%global rcversion RC1
|
||||||
Name: pcre2
|
Name: pcre2
|
||||||
Version: 10.31
|
Version: 10.31
|
||||||
Release: %{?rcversion:0.}8%{?rcversion:.%rcversion}%{?dist}
|
Release: %{?rcversion:0.}9%{?rcversion:.%rcversion}%{?dist}
|
||||||
%global myversion %{version}%{?rcversion:-%rcversion}
|
%global myversion %{version}%{?rcversion:-%rcversion}
|
||||||
Summary: Perl-compatible regular expression library
|
Summary: Perl-compatible regular expression library
|
||||||
# the library: BSD with exceptions
|
# the library: BSD with exceptions
|
||||||
@ -85,6 +85,9 @@ Patch11: pcre2-10.31-Fix-bug-in-VERSION-number-reading.patch
|
|||||||
# Fix backtracking atomic groups when they are not separated by something with
|
# Fix backtracking atomic groups when they are not separated by something with
|
||||||
# a backtracking point, in upstream after 10.31
|
# a backtracking point, in upstream after 10.31
|
||||||
Patch12: pcre2-10.31-Fixed-atomic-group-backtracking-bug.patch
|
Patch12: pcre2-10.31-Fixed-atomic-group-backtracking-bug.patch
|
||||||
|
# Recognize all Unicode "Pattern White Space" characters with /x option in a pattern,
|
||||||
|
# in upstream after 10.31
|
||||||
|
Patch13: pcre2-10.31-Make-x-more-Perl-compatible-by-recognizing-all-of-Un.patch
|
||||||
BuildRequires: autoconf
|
BuildRequires: autoconf
|
||||||
BuildRequires: automake
|
BuildRequires: automake
|
||||||
BuildRequires: coreutils
|
BuildRequires: coreutils
|
||||||
@ -173,6 +176,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
|
|||||||
%patch10 -p1
|
%patch10 -p1
|
||||||
%patch11 -p1
|
%patch11 -p1
|
||||||
%patch12 -p1
|
%patch12 -p1
|
||||||
|
%patch13 -p1
|
||||||
# Because of multilib patch
|
# Because of multilib patch
|
||||||
libtoolize --copy --force
|
libtoolize --copy --force
|
||||||
autoreconf -vif
|
autoreconf -vif
|
||||||
@ -275,6 +279,9 @@ make %{?_smp_mflags} check VERBOSE=yes
|
|||||||
%{_mandir}/man1/pcre2test.*
|
%{_mandir}/man1/pcre2test.*
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Thu Aug 16 2018 Petr Pisar <ppisar@redhat.com> - 10.31-9
|
||||||
|
- Recognize all Unicode "Pattern White Space" characters with /x option in a pattern
|
||||||
|
|
||||||
* Tue Jul 31 2018 Petr Pisar <ppisar@redhat.com> - 10.31-8
|
* Tue Jul 31 2018 Petr Pisar <ppisar@redhat.com> - 10.31-8
|
||||||
- Fix backtracking atomic groups when they are not separated by something with
|
- Fix backtracking atomic groups when they are not separated by something with
|
||||||
a backtracking point
|
a backtracking point
|
||||||
|
Loading…
Reference in New Issue
Block a user