271 lines
13 KiB
Diff
271 lines
13 KiB
Diff
From 10ce49389ea9ee26a3b02b6494b0a3849d56c6fa Mon Sep 17 00:00:00 2001
|
||
From: Yves Orton <demerphq@gmail.com>
|
||
Date: Mon, 26 Jun 2017 13:19:55 +0200
|
||
Subject: [PATCH] fix #131649 - extended charclass can trigger assert
|
||
|
||
The extended charclass parser makes some assumptions during the
|
||
first pass which are only true on well structured input, and it
|
||
does not properly catch various errors. Later on the code assumes
|
||
that things the first pass will let through are valid, when in
|
||
fact they should trigger errors.
|
||
|
||
(cherry picked from commit 19a498a461d7c81ae3507c450953d1148efecf4f)
|
||
---
|
||
pod/perldiag.pod | 27 ++++++++++++++++++++++++++-
|
||
pod/perlrecharclass.pod | 4 ++--
|
||
regcomp.c | 28 ++++++++++++++++++----------
|
||
t/lib/warnings/regcomp | 6 +++---
|
||
t/re/reg_mesg.t | 29 ++++++++++++++++-------------
|
||
t/re/regex_sets.t | 6 +++---
|
||
6 files changed, 68 insertions(+), 32 deletions(-)
|
||
|
||
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
|
||
index 106fe41121..c29925a2a4 100644
|
||
--- a/pod/perldiag.pod
|
||
+++ b/pod/perldiag.pod
|
||
@@ -5904,7 +5904,7 @@ yourself.
|
||
a perl4 interpreter, especially if the next 2 tokens are "use strict"
|
||
or "my $var" or "our $var".
|
||
|
||
-=item Syntax error in (?[...]) in regex m/%s/
|
||
+=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/
|
||
|
||
(F) Perl could not figure out what you meant inside this construct; this
|
||
notifies you that it is giving up trying.
|
||
@@ -6402,6 +6402,31 @@ to find out why that isn't happening.
|
||
(F) The unexec() routine failed for some reason. See your local FSF
|
||
representative, who probably put it there in the first place.
|
||
|
||
+=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
|
||
+
|
||
+(F) While parsing an extended character class a ']' character was encountered
|
||
+at a point in the definition where the only legal use of ']' is to close the
|
||
+character class definition as part of a '])'; you may have forgotten the close
|
||
+paren, or otherwise confused the parser.
|
||
+
|
||
+=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
|
||
+
|
||
+(F) While parsing a nested extended character class like:
|
||
+
|
||
+ (?[ ... (?flags:(?[ ... ])) ... ])
|
||
+ ^
|
||
+
|
||
+we expected to see a close paren ')' (marked by ^) but did not.
|
||
+
|
||
+=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
|
||
+
|
||
+(F) While parsing a nested extended character class like:
|
||
+
|
||
+ (?[ ... (?flags:(?[ ... ])) ... ])
|
||
+ ^
|
||
+
|
||
+we expected to see the close paren ')' of the wrapper (marked by ^) but did not.
|
||
+
|
||
=item Unexpected binary operator '%c' with no preceding operand in regex;
|
||
marked by S<<-- HERE> in m/%s/
|
||
|
||
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
|
||
index 79480e4131..8c008507d1 100644
|
||
--- a/pod/perlrecharclass.pod
|
||
+++ b/pod/perlrecharclass.pod
|
||
@@ -1128,8 +1128,8 @@ hence both of the following work:
|
||
Any contained POSIX character classes, including things like C<\w> and C<\D>
|
||
respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.
|
||
|
||
-C<< (?[ ]) >> is a regex-compile-time construct. Any attempt to use
|
||
-something which isn't knowable at the time the containing regular
|
||
+Note that C<< (?[ ]) >> is a regex-compile-time construct. Any attempt
|
||
+to use something which isn't knowable at the time the containing regular
|
||
expression is compiled is a fatal error. In practice, this means
|
||
just three limitations:
|
||
|
||
diff --git a/regcomp.c b/regcomp.c
|
||
index 4ee48ede42..ddac290d2b 100644
|
||
--- a/regcomp.c
|
||
+++ b/regcomp.c
|
||
@@ -14840,8 +14840,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
||
TRUE /* Force /x */ );
|
||
|
||
switch (*RExC_parse) {
|
||
- case '?':
|
||
- if (RExC_parse[1] == '[') depth++, RExC_parse++;
|
||
+ case '(':
|
||
+ if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
|
||
+ depth++, RExC_parse+=2;
|
||
/* FALLTHROUGH */
|
||
default:
|
||
break;
|
||
@@ -14898,9 +14899,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
||
}
|
||
|
||
case ']':
|
||
- if (depth--) break;
|
||
- RExC_parse++;
|
||
- if (*RExC_parse == ')') {
|
||
+ if (RExC_parse[1] == ')') {
|
||
+ RExC_parse++;
|
||
+ if (depth--) break;
|
||
node = reganode(pRExC_state, ANYOF, 0);
|
||
RExC_size += ANYOF_SKIP;
|
||
nextchar(pRExC_state);
|
||
@@ -14912,20 +14913,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
|
||
|
||
return node;
|
||
}
|
||
- goto no_close;
|
||
+ /* We output the messages even if warnings are off, because we'll fail
|
||
+ * the very next thing, and these give a likely diagnosis for that */
|
||
+ if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
|
||
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
|
||
+ }
|
||
+ RExC_parse++;
|
||
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
|
||
}
|
||
|
||
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
|
||
}
|
||
|
||
- no_close:
|
||
/* We output the messages even if warnings are off, because we'll fail
|
||
* the very next thing, and these give a likely diagnosis for that */
|
||
if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
|
||
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
|
||
}
|
||
|
||
- FAIL("Syntax error in (?[...])");
|
||
+ vFAIL("Syntax error in (?[...])");
|
||
}
|
||
|
||
/* Pass 2 only after this. */
|
||
@@ -15105,12 +15111,14 @@ redo_curchar:
|
||
* inversion list, and RExC_parse points to the trailing
|
||
* ']'; the next character should be the ')' */
|
||
RExC_parse++;
|
||
- assert(UCHARAT(RExC_parse) == ')');
|
||
+ if (UCHARAT(RExC_parse) != ')')
|
||
+ vFAIL("Expecting close paren for nested extended charclass");
|
||
|
||
/* Then the ')' matching the original '(' handled by this
|
||
* case: statement */
|
||
RExC_parse++;
|
||
- assert(UCHARAT(RExC_parse) == ')');
|
||
+ if (UCHARAT(RExC_parse) != ')')
|
||
+ vFAIL("Expecting close paren for wrapper for nested extended charclass");
|
||
|
||
RExC_parse++;
|
||
RExC_flags = save_flags;
|
||
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
|
||
index 2b084c59b0..51ad57ccbe 100644
|
||
--- a/t/lib/warnings/regcomp
|
||
+++ b/t/lib/warnings/regcomp
|
||
@@ -59,21 +59,21 @@ Unmatched [ in regex; marked by <-- HERE in m/abc[ <-- HERE fi[.00./ at - line
|
||
qr/(?[[[:word]]])/;
|
||
EXPECT
|
||
Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
|
||
-syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
|
||
+Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
|
||
########
|
||
# NAME qr/(?[ [[:digit: ])/
|
||
# OPTION fatal
|
||
qr/(?[[[:digit: ])/;
|
||
EXPECT
|
||
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
|
||
-syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
|
||
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
|
||
########
|
||
# NAME qr/(?[ [:digit: ])/
|
||
# OPTION fatal
|
||
qr/(?[[:digit: ])/
|
||
EXPECT
|
||
Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
|
||
-syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
|
||
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
|
||
########
|
||
# NAME [perl #126141]
|
||
# OPTION fatal
|
||
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
|
||
index d26a7caf37..5194d93751 100644
|
||
--- a/t/re/reg_mesg.t
|
||
+++ b/t/re/reg_mesg.t
|
||
@@ -215,8 +215,9 @@ my @death =
|
||
'/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
|
||
'/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
|
||
|
||
- '/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
|
||
- '/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
|
||
+
|
||
+ '/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
|
||
+ '/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
|
||
'/(?[[:w:]])/' => "",
|
||
'/([.].*)[.]/' => "", # [perl #127582]
|
||
'/[.].*[.]/' => "", # [perl #127604]
|
||
@@ -239,11 +240,12 @@ my @death =
|
||
'/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
|
||
'/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
|
||
'/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
|
||
- '/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
|
||
- '/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
|
||
- '/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
|
||
- '/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
|
||
- '/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
|
||
+ '/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
|
||
+ '/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
|
||
+ '/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
|
||
+ '/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
|
||
+ '/(?[ \t + \e # This was supposed to be a comment ])/' =>
|
||
+ "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
|
||
'/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
|
||
'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
|
||
'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
|
||
@@ -431,10 +433,10 @@ my @death_utf8 = mark_as_utf8(
|
||
|
||
'/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',
|
||
|
||
- '/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
|
||
- '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
|
||
- '/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
|
||
- '/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
|
||
+ '/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
|
||
+ '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
|
||
+ '/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
|
||
+ '/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
|
||
'/ネ(?[[:ネ:]])ネ/' => "",
|
||
'/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
|
||
'/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
|
||
@@ -447,8 +449,9 @@ my @death_utf8 = mark_as_utf8(
|
||
'/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
|
||
'/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
|
||
'/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
|
||
- '/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
|
||
- '/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
|
||
+ '/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
|
||
+ '/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
|
||
+ "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
|
||
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
|
||
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
|
||
'/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
|
||
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
|
||
index 6a79f9d692..e9644bd4e6 100644
|
||
--- a/t/re/regex_sets.t
|
||
+++ b/t/re/regex_sets.t
|
||
@@ -158,13 +158,13 @@ for my $char ("٠", "٥", "٩") {
|
||
eval { $_ = '/(?[(\c]) /'; qr/$_/ };
|
||
like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
|
||
eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
|
||
- like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
|
||
+ like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
|
||
eval { $_ = '(?[(\c])'; qr/$_/ };
|
||
like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
|
||
eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
|
||
- like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
|
||
+ like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
|
||
eval { $_ = '(?[\c[]](])'; qr/$_/ };
|
||
- like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
|
||
+ like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
|
||
like("\c#", qr/(?[\c#])/, '\c# should match itself');
|
||
like("\c[", qr/(?[\c[])/, '\c[ should match itself');
|
||
like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');
|
||
--
|
||
2.11.0
|
||
|