commit ee3db7c8f844556d35a66b3732bad9f44a086491 Author: Jonathan Wakely Date: Mon Sep 27 20:44:24 2021 +0100 libstdc++: Fix handling of invalid ranges in std::regex [PR102447] std::regex currently allows invalid bracket ranges such as [\w-a] which are only allowed by ECMAScript when in web browser compatibility mode. It should be an error, because the start of the range is a character class, not a single character. The current implementation of _Compiler::_M_expression_term does not provide a way to reject this, because we only remember a previous character, not whether we just processed a character class (or collating symbol etc.) This patch replaces the pair used to emulate optional with a custom class closer to pair. That allows us to track three states, so that we can tell when we've just seen a character class. With this additional state the code in _M_expression_term for processing the _S_token_bracket_dash can be improved to correctly reject the [\w-a] case, without regressing for valid cases such as [\w-] and [----]. libstdc++-v3/ChangeLog: PR libstdc++/102447 * include/bits/regex_compiler.h (_Compiler::_BracketState): New class. (_Compiler::_BrackeyMatcher): New alias template. (_Compiler::_M_expression_term): Change pair parameter to _BracketState. Process first character for ECMAScript syntax as well as POSIX. * include/bits/regex_compiler.tcc (_Compiler::_M_insert_bracket_matcher): Pass _BracketState. (_Compiler::_M_expression_term): Use _BracketState to store state between calls. Improve handling of dashes in ranges. * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: Add more tests for ranges containing dashes. Check invalid ranges with character class at the beginning. (cherry picked from commit 7ce3c230edf6e498e125c805a6dd313bf87dc439) diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 7e5c2073554..2eb1c3f7863 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -122,13 +122,45 @@ namespace __detail void _M_insert_bracket_matcher(bool __neg); - // Returns true if successfully matched one term and should continue. + // Cache of the last atom seen in a bracketed range expression. + struct _BracketState + { + enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None; + _CharT _M_char; + + void + set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; } + + _GLIBCXX_NODISCARD _CharT + get() const noexcept { return _M_char; } + + void + reset(_Type __t = _Type::_None) noexcept { _M_type = __t; } + + explicit operator bool() const noexcept + { return _M_type != _Type::_None; } + + // Previous token was a single character. + _GLIBCXX_NODISCARD bool + _M_is_char() const noexcept { return _M_type == _Type::_Char; } + + // Previous token was a character class, equivalent class, + // collating symbol etc. + _GLIBCXX_NODISCARD bool + _M_is_class() const noexcept { return _M_type == _Type::_Class; } + }; + + template + using _BracketMatcher + = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>; + + // Returns true if successfully parsed one term and should continue + // compiling a bracket expression. // Returns false if the compiler should move on. template bool - _M_expression_term(pair& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& - __matcher); + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher); int _M_cur_int_value(int __radix); diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index b1169428afb..5877d30ba52 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -140,7 +140,8 @@ namespace __detail return true; if (this->_M_atom()) { - while (this->_M_quantifier()); + while (this->_M_quantifier()) + ; return true; } return false; @@ -410,7 +411,7 @@ namespace __detail _M_insert_character_class_matcher() { __glibcxx_assert(_M_value.size() == 1); - _BracketMatcher<_TraitsT, __icase, __collate> __matcher + _BracketMatcher<__icase, __collate> __matcher (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); __matcher._M_add_character_class(_M_value, false); __matcher._M_ready(); @@ -424,25 +425,17 @@ namespace __detail _Compiler<_TraitsT>:: _M_insert_bracket_matcher(bool __neg) { - _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); - pair __last_char; // Optional<_CharT> - __last_char.first = false; - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_try_char()) - { - __last_char.first = true; - __last_char.second = _M_value[0]; - } - else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) - { - __last_char.first = true; - __last_char.second = '-'; - } - } - while (_M_expression_term(__last_char, __matcher)); - if (__last_char.first) - __matcher._M_add_char(__last_char.second); + _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits); + _BracketState __last_char; + if (_M_try_char()) + __last_char.set(_M_value[0]); + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + // Dash as first character is a normal character. + __last_char.set('-'); + while (_M_expression_term(__last_char, __matcher)) + ; + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); __matcher._M_ready(); _M_stack.push(_StateSeqT( *_M_nfa, @@ -453,27 +446,27 @@ namespace __detail template bool _Compiler<_TraitsT>:: - _M_expression_term(pair& __last_char, - _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) + _M_expression_term(_BracketState& __last_char, + _BracketMatcher<__icase, __collate>& __matcher) { if (_M_match_token(_ScannerT::_S_token_bracket_end)) return false; + // Add any previously cached char into the matcher and update cache. const auto __push_char = [&](_CharT __ch) { - if (__last_char.first) - __matcher._M_add_char(__last_char.second); - else - __last_char.first = true; - __last_char.second = __ch; + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + __last_char.set(__ch); }; - const auto __flush = [&] + // Add any previously cached char into the matcher and update cache. + const auto __push_class = [&] { - if (__last_char.first) - { - __matcher._M_add_char(__last_char.second); - __last_char.first = false; - } + if (__last_char._M_is_char()) + __matcher._M_add_char(__last_char.get()); + // We don't cache anything here, just record that the last thing + // processed was a character class (or similar). + __last_char.reset(_BracketState::_Type::_Class); }; if (_M_match_token(_ScannerT::_S_token_collsymbol)) @@ -482,16 +475,16 @@ namespace __detail if (__symbol.size() == 1) __push_char(__symbol[0]); else - __flush(); + __push_class(); } else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) { - __flush(); + __push_class(); __matcher._M_add_equivalence_class(_M_value); } else if (_M_match_token(_ScannerT::_S_token_char_class_name)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, false); } else if (_M_try_char()) @@ -508,49 +501,50 @@ namespace __detail // It turns out that no one reads BNFs ;) else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - if (!__last_char.first) + if (_M_match_token(_ScannerT::_S_token_bracket_end)) { - if (!(_M_flags & regex_constants::ECMAScript)) - { - if (_M_match_token(_ScannerT::_S_token_bracket_end)) - { - __push_char('-'); - return false; - } - __throw_regex_error( - regex_constants::error_range, - "Unexpected dash in bracket expression. For POSIX syntax, " - "a dash is not treated literally only when it is at " - "beginning or end."); - } + // For "-]" the dash is a literal character. __push_char('-'); + return false; } - else + else if (__last_char._M_is_class()) + { + // "\\w-" is invalid, start of range must be a single char. + __throw_regex_error(regex_constants::error_range, + "Invalid start of range in bracket expression."); + } + else if (__last_char._M_is_char()) { if (_M_try_char()) { - __matcher._M_make_range(__last_char.second, _M_value[0]); - __last_char.first = false; + // "x-y" + __matcher._M_make_range(__last_char.get(), _M_value[0]); + __last_char.reset(); } else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) { - __matcher._M_make_range(__last_char.second, '-'); - __last_char.first = false; + // "x--" + __matcher._M_make_range(__last_char.get(), '-'); + __last_char.reset(); } else - { - if (_M_scanner._M_get_token() - != _ScannerT::_S_token_bracket_end) - __throw_regex_error( - regex_constants::error_range, - "Character is expected after a dash."); - __push_char('-'); - } + __throw_regex_error(regex_constants::error_range, + "Invalid end of range in bracket expression."); } + else if (_M_flags & regex_constants::ECMAScript) + { + // A dash that is not part of an existing range. Might be the + // start of a new range, or might just be a literal '-' char. + // Only ECMAScript allows that in the middle of a bracket expr. + __push_char('-'); + } + else + __throw_regex_error(regex_constants::error_range, + "Invalid dash in bracket expression."); } else if (_M_match_token(_ScannerT::_S_token_quoted_class)) { - __flush(); + __push_class(); __matcher._M_add_character_class(_M_value, _M_ctype.is(_CtypeT::upper, _M_value[0])); diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc index 236ab663fc0..57088f5af83 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc @@ -68,6 +68,16 @@ test01() void test02() { + VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript))); + VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript))); + + VERIFY(regex_match("-", regex("[-]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[--]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[---]", regex_constants::extended))); + VERIFY(regex_match("-", regex("[----]", regex_constants::extended))); try { std::regex re("[-----]", std::regex::extended); @@ -77,7 +87,6 @@ test02() { VERIFY(e.code() == std::regex_constants::error_range); } - std::regex re("[-----]", std::regex::ECMAScript); VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended))); VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended))); @@ -92,7 +101,27 @@ test02() } catch (const std::regex_error& e) { + VERIFY(e.code() == std::regex_constants::error_range); + } + try + { + regex("[@--]", regex_constants::extended); + VERIFY(false); } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + try + { + regex("[--%]", regex_constants::extended); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended))); } @@ -157,6 +186,36 @@ test06() VERIFY(regex_match("a-", debian_cron_namespace_ok)); } +// libstdc++/102447 +void +test07() +{ + VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript))); + VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript))); + VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript))); + + try + { + std::regex re("[\\w-a]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } + + try + { + std::regex re("[\\w--]", std::regex::ECMAScript); + VERIFY(false); + } + catch (const std::regex_error& e) + { + VERIFY(e.code() == std::regex_constants::error_range); + } +} + int main() { @@ -166,6 +225,7 @@ main() test04(); test05(); test06(); + test07(); return 0; } commit 1851cc4c5f2666dfdec53a2ada57095ffc59e08b Author: Jonathan Wakely Date: Mon Dec 13 13:36:33 2021 +0000 libstdc++: Fix non-reserved name in header libstdc++-v3/ChangeLog: * include/bits/regex_compiler.tcc (_Compiler::_M_match_token): Use reserved name for parameter. * testsuite/17_intro/names.cc: Check "token". (cherry picked from commit b0e6a257f1862e217cdf19332ea0f7bad56dcddc) diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 8af920e5fe9..b1169428afb 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -586,9 +586,9 @@ namespace __detail template bool _Compiler<_TraitsT>:: - _M_match_token(_TokenT token) + _M_match_token(_TokenT __token) { - if (token == _M_scanner._M_get_token()) + if (__token == _M_scanner._M_get_token()) { _M_value = _M_scanner._M_get_value(); _M_scanner._M_advance(); diff --git a/libstdc++-v3/testsuite/17_intro/names.cc b/libstdc++-v3/testsuite/17_intro/names.cc index d758138dfb1..6c06aba7228 100644 --- a/libstdc++-v3/testsuite/17_intro/names.cc +++ b/libstdc++-v3/testsuite/17_intro/names.cc @@ -99,6 +99,7 @@ #define z ( #define tmp ( +#define token ( #if __cplusplus < 201103L #define uses_allocator ( --- a/libstdc++-v3/include/bits/c++config.orig 2022-07-08 15:06:14.083231445 -0400 +++ b/libstdc++-v3/include/bits/c++config 2022-07-08 15:06:41.733247859 -0400 @@ -99,6 +99,12 @@ # define _GLIBCXX_ABI_TAG_CXX11 __attribute ((__abi_tag__ ("cxx11"))) #endif +// Macro to warn about unused results. +#if __cplusplus >= 201703L +# define _GLIBCXX_NODISCARD [[__nodiscard__]] +#else +# define _GLIBCXX_NODISCARD +#endif #if __cplusplus