Accept \N{U+hhhh} only in UTF mode
This commit is contained in:
parent
456a47d61f
commit
f7744b5476
@ -0,0 +1,258 @@
|
|||||||
|
From 11ad23b7e6f2b3b81bc7e1e605e34416b5b900ff Mon Sep 17 00:00:00 2001
|
||||||
|
From: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>
|
||||||
|
Date: Sun, 2 Sep 2018 16:03:27 +0000
|
||||||
|
Subject: [PATCH] Lock out \N{U+hhhh} in non-UTF (non-Unicode) modes.
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@994 6239d852-aaf2-0410-a92c-79f79f948069
|
||||||
|
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
||||||
|
---
|
||||||
|
ChangeLog | 2 +-
|
||||||
|
doc/pcre2api.3 | 5 +++--
|
||||||
|
doc/pcre2pattern.3 | 16 +++++++++-------
|
||||||
|
doc/pcre2syntax.3 | 6 +++---
|
||||||
|
doc/pcre2unicode.3 | 13 +++++++++----
|
||||||
|
src/pcre2.h.in | 2 +-
|
||||||
|
src/pcre2_compile.c | 14 ++++++++++----
|
||||||
|
src/pcre2_error.c | 2 +-
|
||||||
|
testdata/testinput5 | 2 ++
|
||||||
|
testdata/testoutput5 | 3 +++
|
||||||
|
10 files changed, 42 insertions(+), 23 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/ChangeLog b/ChangeLog
|
||||||
|
index 264bfae..4b55639 100644
|
||||||
|
--- a/ChangeLog
|
||||||
|
+++ b/ChangeLog
|
||||||
|
@@ -130,7 +130,7 @@ present.
|
||||||
|
28. A (*MARK) name was not being passed back for positive assertions that were
|
||||||
|
terminated by (*ACCEPT).
|
||||||
|
|
||||||
|
-29. Add support for \N{U+dddd}, but not in EBCDIC environments.
|
||||||
|
+29. Add support for \N{U+dddd}, but only in Unicode mode.
|
||||||
|
|
||||||
|
30. Add support for (?^) for unsetting all imnsx options.
|
||||||
|
|
||||||
|
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
|
||||||
|
index 5a97739..fff4fcc 100644
|
||||||
|
--- a/doc/pcre2api.3
|
||||||
|
+++ b/doc/pcre2api.3
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-.TH PCRE2API 3 "03 August 2018" "PCRE2 10.32"
|
||||||
|
+.TH PCRE2API 3 "02 September 2018" "PCRE2 10.32"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.sp
|
||||||
|
@@ -1756,7 +1756,8 @@ behaviour of PCRE2 are given in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2unicode\fP
|
||||||
|
.\"
|
||||||
|
-page.
|
||||||
|
+page. In particular, note that it changes the way PCRE2_CASELESS handles
|
||||||
|
+characters with code points greater than 127.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.\" HTML <a name="extracompileoptions"></a>
|
||||||
|
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
|
||||||
|
index aa0d652..fd086b4 100644
|
||||||
|
--- a/doc/pcre2pattern.3
|
||||||
|
+++ b/doc/pcre2pattern.3
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-.TH PCRE2PATTERN 3 "03 August 2018" "PCRE2 10.32"
|
||||||
|
+.TH PCRE2PATTERN 3 "02 September 2018" "PCRE2 10.32"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@@ -376,14 +376,15 @@ these escapes are as follows:
|
||||||
|
\eddd character with octal code ddd, or backreference
|
||||||
|
\eo{ddd..} character with octal code ddd..
|
||||||
|
\exhh character with hex code hh
|
||||||
|
- \ex{hhh..} character with hex code hhh.. (default mode)
|
||||||
|
- \eN{U+hhh..} character with Unicode code point hhh..
|
||||||
|
+ \ex{hhh..} character with hex code hhh..
|
||||||
|
+ \eN{U+hhh..} character with Unicode hex code point hhh..
|
||||||
|
\euhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set)
|
||||||
|
.sp
|
||||||
|
+The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
|
||||||
|
+is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
|
||||||
|
+\eN{name} to specify characters by Unicode name; PCRE2 does not support this.
|
||||||
|
Note that when \eN is not followed by an opening brace (curly bracket) it has
|
||||||
|
an entirely different meaning, matching any character that is not a newline.
|
||||||
|
-Perl also uses \eN{name} to specify characters by Unicode name; PCRE2 does not
|
||||||
|
-support this.
|
||||||
|
.P
|
||||||
|
The precise effect of \ecx on ASCII characters is as follows: if x is a lower
|
||||||
|
case letter, it is converted to upper case. Then bit 6 of the character (hex
|
||||||
|
@@ -509,7 +510,8 @@ limited to certain values, as follows:
|
||||||
|
Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the
|
||||||
|
so-called "surrogate" code points). The check for these can be disabled by the
|
||||||
|
caller of \fBpcre2_compile()\fP by setting the option
|
||||||
|
-PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES.
|
||||||
|
+PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8
|
||||||
|
+and UTF-32 modes, because these values are not representable in UTF-16.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS "Escape sequences in character classes"
|
||||||
|
@@ -3650,6 +3652,6 @@ Cambridge, England.
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
-Last updated: 03 August 2018
|
||||||
|
+Last updated: 02 September 2018
|
||||||
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
|
.fi
|
||||||
|
diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3
|
||||||
|
index e5ace25..c392bfb 100644
|
||||||
|
--- a/doc/pcre2syntax.3
|
||||||
|
+++ b/doc/pcre2syntax.3
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-.TH PCRE2SYNTAX 3 "01 August 2018" "PCRE2 10.32"
|
||||||
|
+.TH PCRE2SYNTAX 3 "02 September 2018" "PCRE2 10.32"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||||
|
@@ -35,7 +35,7 @@ This table applies to ASCII and Unicode environments.
|
||||||
|
\eddd character with octal code ddd, or backreference
|
||||||
|
\eo{ddd..} character with octal code ddd..
|
||||||
|
\eU "U" if PCRE2_ALT_BSUX is set (otherwise is an error)
|
||||||
|
- \eN{U+hh..} character with Unicode code point hh..
|
||||||
|
+ \eN{U+hh..} character with Unicode code point hh.. (Unicode mode only)
|
||||||
|
\euhhhh character with hex code hhhh (if PCRE2_ALT_BSUX is set)
|
||||||
|
\exhh character with hex code hh
|
||||||
|
\ex{hh..} character with hex code hh..
|
||||||
|
@@ -621,6 +621,6 @@ Cambridge, England.
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
-Last updated: 01 August 2018
|
||||||
|
+Last updated: 02 September 2018
|
||||||
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
|
.fi
|
||||||
|
diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3
|
||||||
|
index fdaddc4..edd8dcd 100644
|
||||||
|
--- a/doc/pcre2unicode.3
|
||||||
|
+++ b/doc/pcre2unicode.3
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-.TH PCRE2UNICODE 3 "17 May 2017" "PCRE2 10.30"
|
||||||
|
+.TH PCRE2UNICODE 3 "02 September 2018" "PCRE2 10.32"
|
||||||
|
.SH NAME
|
||||||
|
PCRE - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH "UNICODE AND UTF SUPPORT"
|
||||||
|
@@ -16,7 +16,8 @@ you must call
|
||||||
|
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
|
(*UTF). When either of these is the case, both the pattern and any subject
|
||||||
|
strings that are matched against it are treated as UTF strings instead of
|
||||||
|
-strings of individual one-code-unit characters.
|
||||||
|
+strings of individual one-code-unit characters. There are also some other
|
||||||
|
+changes to the way characters are handled, as documented below.
|
||||||
|
.P
|
||||||
|
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||||
|
case the library will be smaller.
|
||||||
|
@@ -51,6 +52,10 @@ unbraced hexadecimal escape sequences (for example, \ex{b3} or \exb3). Larger
|
||||||
|
values have to use braced sequences. Unbraced octal code points up to \e777 are
|
||||||
|
also recognized; larger ones can be coded using \eo{...}.
|
||||||
|
.P
|
||||||
|
+The escape sequence \eN{U+<hex digits>} is recognized as another way of
|
||||||
|
+specifying a Unicode character by code point in a UTF mode. It is not allowed
|
||||||
|
+in non-UTF modes.
|
||||||
|
+.P
|
||||||
|
In UTF modes, repeat quantifiers apply to complete UTF characters, not to
|
||||||
|
individual code units.
|
||||||
|
.P
|
||||||
|
@@ -280,6 +285,6 @@ Cambridge, England.
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
-Last updated: 17 May 2017
|
||||||
|
-Copyright (c) 1997-2017 University of Cambridge.
|
||||||
|
+Last updated: 02 September 2018
|
||||||
|
+Copyright (c) 1997-2018 University of Cambridge.
|
||||||
|
.fi
|
||||||
|
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
|
||||||
|
index 4c7c674..a9396e0 100644
|
||||||
|
--- a/src/pcre2.h.in
|
||||||
|
+++ b/src/pcre2.h.in
|
||||||
|
@@ -316,7 +316,7 @@ pcre2_pattern_convert(). */
|
||||||
|
#define PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP 190
|
||||||
|
#define PCRE2_ERROR_NO_SURROGATES_IN_UTF16 191
|
||||||
|
#define PCRE2_ERROR_BAD_LITERAL_OPTIONS 192
|
||||||
|
-#define PCRE2_ERROR_NOT_SUPPORTED_IN_EBCDIC 193
|
||||||
|
+#define PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE 193
|
||||||
|
#define PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS 194
|
||||||
|
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
|
||||||
|
index 8c30064..f6a7e99 100644
|
||||||
|
--- a/src/pcre2_compile.c
|
||||||
|
+++ b/src/pcre2_compile.c
|
||||||
|
@@ -1454,16 +1454,22 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
|
||||||
|
/* \N{U+ can be handled by the \x{ code. However, this construction is
|
||||||
|
not valid in EBCDIC environments because it specifies a Unicode
|
||||||
|
character, not a codepoint in the local code. For example \N{U+0041}
|
||||||
|
- must be "A" in all environments. */
|
||||||
|
+ must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
|
||||||
|
+ casing semantics for the entire pattern, so allow it only in UTF (i.e.
|
||||||
|
+ Unicode) mode. */
|
||||||
|
|
||||||
|
if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
|
||||||
|
{
|
||||||
|
#ifdef EBCDIC
|
||||||
|
*errorcodeptr = ERR93;
|
||||||
|
#else
|
||||||
|
- ptr = p + 1;
|
||||||
|
- escape = 0; /* Not a fancy escape after all */
|
||||||
|
- goto COME_FROM_NU;
|
||||||
|
+ if (utf)
|
||||||
|
+ {
|
||||||
|
+ ptr = p + 1;
|
||||||
|
+ escape = 0; /* Not a fancy escape after all */
|
||||||
|
+ goto COME_FROM_NU;
|
||||||
|
+ }
|
||||||
|
+ else *errorcodeptr = ERR93;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
|
||||||
|
index 64ab5d9..4b3b3f1 100644
|
||||||
|
--- a/src/pcre2_error.c
|
||||||
|
+++ b/src/pcre2_error.c
|
||||||
|
@@ -179,7 +179,7 @@ static const unsigned char compile_error_texts[] =
|
||||||
|
"internal error: bad code value in parsed_skip()\0"
|
||||||
|
"PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0"
|
||||||
|
"invalid option bits with PCRE2_LITERAL\0"
|
||||||
|
- "\\N{U+dddd} is not supported in EBCDIC mode\0"
|
||||||
|
+ "\\N{U+dddd} is supported only in Unicode (UTF) mode\0"
|
||||||
|
"invalid hyphen in option setting\0"
|
||||||
|
;
|
||||||
|
|
||||||
|
diff --git a/testdata/testinput5 b/testdata/testinput5
|
||||||
|
index e339808..687de32 100644
|
||||||
|
--- a/testdata/testinput5
|
||||||
|
+++ b/testdata/testinput5
|
||||||
|
@@ -2089,6 +2089,8 @@
|
||||||
|
|
||||||
|
/\N{U+}/
|
||||||
|
|
||||||
|
+/\N{U+}/utf
|
||||||
|
+
|
||||||
|
/\N{U}/
|
||||||
|
|
||||||
|
# This tests the non-UTF Unicode NEL pattern whitespace character, only
|
||||||
|
diff --git a/testdata/testoutput5 b/testdata/testoutput5
|
||||||
|
index a9552f4..51caa18 100644
|
||||||
|
--- a/testdata/testoutput5
|
||||||
|
+++ b/testdata/testoutput5
|
||||||
|
@@ -4751,6 +4751,9 @@ No match
|
||||||
|
0: \x{1d1aa}
|
||||||
|
|
||||||
|
/\N{U+}/
|
||||||
|
+Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode
|
||||||
|
+
|
||||||
|
+/\N{U+}/utf
|
||||||
|
Failed: error 178 at offset 5: digits missing in \x{} or \o{} or \N{U+}
|
||||||
|
|
||||||
|
/\N{U}/
|
||||||
|
--
|
||||||
|
2.14.4
|
||||||
|
|
@ -9,7 +9,7 @@
|
|||||||
%global rcversion RC1
|
%global rcversion RC1
|
||||||
Name: pcre2
|
Name: pcre2
|
||||||
Version: 10.32
|
Version: 10.32
|
||||||
Release: %{?rcversion:0.}2%{?rcversion:.%rcversion}%{?dist}
|
Release: %{?rcversion:0.}3%{?rcversion:.%rcversion}%{?dist}
|
||||||
%global myversion %{version}%{?rcversion:-%rcversion}
|
%global myversion %{version}%{?rcversion:-%rcversion}
|
||||||
Summary: Perl-compatible regular expression library
|
Summary: Perl-compatible regular expression library
|
||||||
# the library: BSD with exceptions
|
# the library: BSD with exceptions
|
||||||
@ -53,6 +53,9 @@ Patch0: pcre2-10.10-Fix-multilib.patch
|
|||||||
# 256 that is followed by a positive class with only characters less than 256,
|
# 256 that is followed by a positive class with only characters less than 256,
|
||||||
# upstream bug #2300, in upstream after 10.32-RC1
|
# upstream bug #2300, in upstream after 10.32-RC1
|
||||||
Patch1: pcre-10.32-RC1-Fix-bad-auto-possessification-of-certain-types-of-cl.patch
|
Patch1: pcre-10.32-RC1-Fix-bad-auto-possessification-of-certain-types-of-cl.patch
|
||||||
|
# Accept \N{U+hhhh} only in UTF mode, upstream bug #2305,
|
||||||
|
# in upstream after 10.32-RC1
|
||||||
|
Patch2: pcre2-10.32-RC1-Lock-out-N-U-hhhh-in-non-UTF-non-Unicode-modes.patch
|
||||||
BuildRequires: autoconf
|
BuildRequires: autoconf
|
||||||
BuildRequires: automake
|
BuildRequires: automake
|
||||||
BuildRequires: coreutils
|
BuildRequires: coreutils
|
||||||
@ -130,6 +133,7 @@ Utilities demonstrating PCRE2 capabilities like pcre2grep or pcre2test.
|
|||||||
%setup -q -n %{name}-%{myversion}
|
%setup -q -n %{name}-%{myversion}
|
||||||
%patch0 -p1
|
%patch0 -p1
|
||||||
%patch1 -p1
|
%patch1 -p1
|
||||||
|
%patch2 -p1
|
||||||
# Because of multilib patch
|
# Because of multilib patch
|
||||||
libtoolize --copy --force
|
libtoolize --copy --force
|
||||||
autoreconf -vif
|
autoreconf -vif
|
||||||
@ -232,6 +236,9 @@ make %{?_smp_mflags} check VERBOSE=yes
|
|||||||
%{_mandir}/man1/pcre2test.*
|
%{_mandir}/man1/pcre2test.*
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Mon Sep 03 2018 Petr Pisar <ppisar@redhat.com> - 10.32-0.3.RC1
|
||||||
|
- Accept \N{U+hhhh} only in UTF mode (upstream bug #2305)
|
||||||
|
|
||||||
* Mon Aug 20 2018 Petr Pisar <ppisar@redhat.com> - 10.32-0.2.RC1
|
* Mon Aug 20 2018 Petr Pisar <ppisar@redhat.com> - 10.32-0.2.RC1
|
||||||
- Fix autopossessifying a repeated negative class with no characters less than
|
- Fix autopossessifying a repeated negative class with no characters less than
|
||||||
256 that is followed by a positive class with only characters less than 256,
|
256 that is followed by a positive class with only characters less than 256,
|
||||||
|
Loading…
Reference in New Issue
Block a user