From f8bba868eadff9f1832864f3434e762f43e69733 Mon Sep 17 00:00:00 2001 From: Marek Polacek Date: Tue, 12 Jul 2022 15:12:18 -0400 Subject: [PATCH] Fix handling of invalid ranges in std::regex Resolves: #2106262 --- gcc.spec | 8 +- gcc11-rh2106262.patch | 388 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 395 insertions(+), 1 deletion(-) create mode 100644 gcc11-rh2106262.patch diff --git a/gcc.spec b/gcc.spec index 2313c45..b27d3f8 100644 --- a/gcc.spec +++ b/gcc.spec @@ -116,7 +116,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: %{gcc_release}%{?dist} +Release: %{gcc_release}.1%{?dist} # libgcc, libgfortran, libgomp, libstdc++ and crtstuff have # GCC Runtime Exception. License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD @@ -274,6 +274,8 @@ Patch21: gcc11-dejagnu-multiline.patch Patch23: gcc11-pie.patch Patch24: gcc11-bind-now.patch Patch25: gcc11-pr105331.patch +# This can go once we rebase from GCC 11. +Patch26: gcc11-rh2106262.patch Patch100: gcc11-fortran-fdec-duplicates.patch Patch101: gcc11-fortran-flogical-as-integer.patch @@ -826,6 +828,7 @@ so that there cannot be any synchronization problems. %patch23 -p1 -b .pie~ %patch24 -p1 -b .now~ %patch25 -p0 -b .pr105331~ +%patch26 -p0 -b .rh2106262~ %if 0%{?rhel} >= 9 %patch100 -p1 -b .fortran-fdec-duplicates~ @@ -3271,6 +3274,9 @@ end %{ANNOBIN_GCC_PLUGIN_DIR}/gcc-annobin.so.0.0.0 %changelog +* Tue Jul 12 2022 Marek Polacek 11.3.1-2.1 +- fix handling of invalid ranges in std::regex (#2106262) + * Thu Apr 21 2022 Jakub Jelinek 11.3.1-2 - update from releases/gcc-11-branch (#2077536) - GCC 11.3 release diff --git a/gcc11-rh2106262.patch b/gcc11-rh2106262.patch new file mode 100644 index 0000000..34e9c5b --- /dev/null +++ b/gcc11-rh2106262.patch @@ -0,0 +1,388 @@ +commit c725028a8bb9478ec84332641147ad12b9236922 +Author: Jonathan Wakely +Date: Tue Dec 14 14:32:35 2021 +0000 + + libstdc++: Fix handling of invalid ranges in std::regex [PR102447] + + std::regex currently allows invalid bracket ranges such as [\w-a] which + are only allowed by ECMAScript when in web browser compatibility mode. + It should be an error, because the start of the range is a character + class, not a single character. The current implementation of + _Compiler::_M_expression_term does not provide a way to reject this, + because we only remember a previous character, not whether we just + processed a character class (or collating symbol etc.) + + This patch replaces the pair used to emulate + optional with a custom class closer to pair. That + allows us to track three states, so that we can tell when we've just + seen a character class. + + With this additional state the code in _M_expression_term for processing + the _S_token_bracket_dash can be improved to correctly reject the [\w-a] + case, without regressing for valid cases such as [\w-] and [----]. + + libstdc++-v3/ChangeLog: + + PR libstdc++/102447 + * include/bits/regex_compiler.h (_Compiler::_BracketState): New + class. + (_Compiler::_BrackeyMatcher): New alias template. + (_Compiler::_M_expression_term): Change pair + parameter to _BracketState. Process first character for + ECMAScript syntax as well as POSIX. + * include/bits/regex_compiler.tcc + (_Compiler::_M_insert_bracket_matcher): Pass _BracketState. + (_Compiler::_M_expression_term): Use _BracketState to store + state between calls. Improve handling of dashes in ranges. + * testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc: + Add more tests for ranges containing dashes. Check invalid + ranges with character class at the beginning. + + (cherry picked from commit 7ce3c230edf6e498e125c805a6dd313bf87dc439) + +diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h +index f224fcb06e0..aa19df2bf9a 100644 +--- a/libstdc++-v3/include/bits/regex_compiler.h ++++ b/libstdc++-v3/include/bits/regex_compiler.h +@@ -122,13 +122,45 @@ namespace __detail + void + _M_insert_bracket_matcher(bool __neg); + +- // Returns true if successfully matched one term and should continue. ++ // Cache of the last atom seen in a bracketed range expression. ++ struct _BracketState ++ { ++ enum class _Type : char { _None, _Char, _Class } _M_type = _Type::_None; ++ _CharT _M_char; ++ ++ void ++ set(_CharT __c) noexcept { _M_type = _Type::_Char; _M_char = __c; } ++ ++ _GLIBCXX_NODISCARD _CharT ++ get() const noexcept { return _M_char; } ++ ++ void ++ reset(_Type __t = _Type::_None) noexcept { _M_type = __t; } ++ ++ explicit operator bool() const noexcept ++ { return _M_type != _Type::_None; } ++ ++ // Previous token was a single character. ++ _GLIBCXX_NODISCARD bool ++ _M_is_char() const noexcept { return _M_type == _Type::_Char; } ++ ++ // Previous token was a character class, equivalent class, ++ // collating symbol etc. ++ _GLIBCXX_NODISCARD bool ++ _M_is_class() const noexcept { return _M_type == _Type::_Class; } ++ }; ++ ++ template ++ using _BracketMatcher ++ = std::__detail::_BracketMatcher<_TraitsT, __icase, __collate>; ++ ++ // Returns true if successfully parsed one term and should continue ++ // compiling a bracket expression. + // Returns false if the compiler should move on. + template + bool +- _M_expression_term(pair& __last_char, +- _BracketMatcher<_TraitsT, __icase, __collate>& +- __matcher); ++ _M_expression_term(_BracketState& __last_char, ++ _BracketMatcher<__icase, __collate>& __matcher); + + int + _M_cur_int_value(int __radix); +diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc +index ea07bc2428e..7769a9e63a3 100644 +--- a/libstdc++-v3/include/bits/regex_compiler.tcc ++++ b/libstdc++-v3/include/bits/regex_compiler.tcc +@@ -403,7 +403,7 @@ namespace __detail + _M_insert_character_class_matcher() + { + __glibcxx_assert(_M_value.size() == 1); +- _BracketMatcher<_TraitsT, __icase, __collate> __matcher ++ _BracketMatcher<__icase, __collate> __matcher + (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); + __matcher._M_add_character_class(_M_value, false); + __matcher._M_ready(); +@@ -424,25 +424,17 @@ namespace __detail + _Compiler<_TraitsT>:: + _M_insert_bracket_matcher(bool __neg) + { +- _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); +- pair __last_char; // Optional<_CharT> +- __last_char.first = false; +- if (!(_M_flags & regex_constants::ECMAScript)) +- { +- if (_M_try_char()) +- { +- __last_char.first = true; +- __last_char.second = _M_value[0]; +- } +- else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) +- { +- __last_char.first = true; +- __last_char.second = '-'; +- } +- } +- while (_M_expression_term(__last_char, __matcher)); +- if (__last_char.first) +- __matcher._M_add_char(__last_char.second); ++ _BracketMatcher<__icase, __collate> __matcher(__neg, _M_traits); ++ _BracketState __last_char; ++ if (_M_try_char()) ++ __last_char.set(_M_value[0]); ++ else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) ++ // Dash as first character is a normal character. ++ __last_char.set('-'); ++ while (_M_expression_term(__last_char, __matcher)) ++ ; ++ if (__last_char._M_is_char()) ++ __matcher._M_add_char(__last_char.get()); + __matcher._M_ready(); + _M_stack.push(_StateSeqT( + *_M_nfa, +@@ -447,27 +438,27 @@ namespace __detail + template + bool + _Compiler<_TraitsT>:: +- _M_expression_term(pair& __last_char, +- _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) ++ _M_expression_term(_BracketState& __last_char, ++ _BracketMatcher<__icase, __collate>& __matcher) + { + if (_M_match_token(_ScannerT::_S_token_bracket_end)) + return false; + ++ // Add any previously cached char into the matcher and update cache. + const auto __push_char = [&](_CharT __ch) + { +- if (__last_char.first) +- __matcher._M_add_char(__last_char.second); +- else +- __last_char.first = true; +- __last_char.second = __ch; ++ if (__last_char._M_is_char()) ++ __matcher._M_add_char(__last_char.get()); ++ __last_char.set(__ch); + }; +- const auto __flush = [&] ++ // Add any previously cached char into the matcher and update cache. ++ const auto __push_class = [&] + { +- if (__last_char.first) +- { +- __matcher._M_add_char(__last_char.second); +- __last_char.first = false; +- } ++ if (__last_char._M_is_char()) ++ __matcher._M_add_char(__last_char.get()); ++ // We don't cache anything here, just record that the last thing ++ // processed was a character class (or similar). ++ __last_char.reset(_BracketState::_Type::_Class); + }; + + if (_M_match_token(_ScannerT::_S_token_collsymbol)) +@@ -476,16 +467,16 @@ namespace __detail + if (__symbol.size() == 1) + __push_char(__symbol[0]); + else +- __flush(); ++ __push_class(); + } + else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) + { +- __flush(); ++ __push_class(); + __matcher._M_add_equivalence_class(_M_value); + } + else if (_M_match_token(_ScannerT::_S_token_char_class_name)) + { +- __flush(); ++ __push_class(); + __matcher._M_add_character_class(_M_value, false); + } + else if (_M_try_char()) +@@ -502,49 +493,50 @@ namespace __detail + // It turns out that no one reads BNFs ;) + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { +- if (!__last_char.first) ++ if (_M_match_token(_ScannerT::_S_token_bracket_end)) + { +- if (!(_M_flags & regex_constants::ECMAScript)) +- { +- if (_M_match_token(_ScannerT::_S_token_bracket_end)) +- { +- __push_char('-'); +- return false; +- } +- __throw_regex_error( +- regex_constants::error_range, +- "Unexpected dash in bracket expression. For POSIX syntax, " +- "a dash is not treated literally only when it is at " +- "beginning or end."); +- } ++ // For "-]" the dash is a literal character. + __push_char('-'); ++ return false; + } +- else ++ else if (__last_char._M_is_class()) ++ { ++ // "\\w-" is invalid, start of range must be a single char. ++ __throw_regex_error(regex_constants::error_range, ++ "Invalid start of range in bracket expression."); ++ } ++ else if (__last_char._M_is_char()) + { + if (_M_try_char()) + { +- __matcher._M_make_range(__last_char.second, _M_value[0]); +- __last_char.first = false; ++ // "x-y" ++ __matcher._M_make_range(__last_char.get(), _M_value[0]); ++ __last_char.reset(); + } + else if (_M_match_token(_ScannerT::_S_token_bracket_dash)) + { +- __matcher._M_make_range(__last_char.second, '-'); +- __last_char.first = false; ++ // "x--" ++ __matcher._M_make_range(__last_char.get(), '-'); ++ __last_char.reset(); + } + else +- { +- if (_M_scanner._M_get_token() +- != _ScannerT::_S_token_bracket_end) +- __throw_regex_error( +- regex_constants::error_range, +- "Character is expected after a dash."); +- __push_char('-'); +- } ++ __throw_regex_error(regex_constants::error_range, ++ "Invalid end of range in bracket expression."); + } ++ else if (_M_flags & regex_constants::ECMAScript) ++ { ++ // A dash that is not part of an existing range. Might be the ++ // start of a new range, or might just be a literal '-' char. ++ // Only ECMAScript allows that in the middle of a bracket expr. ++ __push_char('-'); ++ } ++ else ++ __throw_regex_error(regex_constants::error_range, ++ "Invalid dash in bracket expression."); + } + else if (_M_match_token(_ScannerT::_S_token_quoted_class)) + { +- __flush(); ++ __push_class(); + __matcher._M_add_character_class(_M_value, + _M_ctype.is(_CtypeT::upper, + _M_value[0])); +diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +index 7df70604ea6..0d76e63da7b 100644 +--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc ++++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/cstring_bracket_01.cc +@@ -69,6 +69,16 @@ test01() + void + test02() + { ++ VERIFY(regex_match("-", regex("[-]", regex_constants::ECMAScript))); ++ VERIFY(regex_match("-", regex("[--]", regex_constants::ECMAScript))); ++ VERIFY(regex_match("-", regex("[---]", regex_constants::ECMAScript))); ++ VERIFY(regex_match("-", regex("[----]", regex_constants::ECMAScript))); ++ VERIFY(regex_match("-", regex("[-----]", regex_constants::ECMAScript))); ++ ++ VERIFY(regex_match("-", regex("[-]", regex_constants::extended))); ++ VERIFY(regex_match("-", regex("[--]", regex_constants::extended))); ++ VERIFY(regex_match("-", regex("[---]", regex_constants::extended))); ++ VERIFY(regex_match("-", regex("[----]", regex_constants::extended))); + try + { + std::regex re("[-----]", std::regex::extended); +@@ -78,7 +88,6 @@ test02() + { + VERIFY(e.code() == std::regex_constants::error_range); + } +- std::regex re("[-----]", std::regex::ECMAScript); + + VERIFY(!regex_match("b", regex("[-ac]", regex_constants::extended))); + VERIFY(!regex_match("b", regex("[ac-]", regex_constants::extended))); +@@ -93,7 +102,27 @@ test02() + } + catch (const std::regex_error& e) + { ++ VERIFY(e.code() == std::regex_constants::error_range); ++ } ++ try ++ { ++ regex("[@--]", regex_constants::extended); ++ VERIFY(false); + } ++ catch (const std::regex_error& e) ++ { ++ VERIFY(e.code() == std::regex_constants::error_range); ++ } ++ try ++ { ++ regex("[--%]", regex_constants::extended); ++ VERIFY(false); ++ } ++ catch (const std::regex_error& e) ++ { ++ VERIFY(e.code() == std::regex_constants::error_range); ++ } ++ + VERIFY(regex_match("].", regex("[][.hyphen.]-0]*", regex_constants::extended))); + } + +@@ -158,6 +187,36 @@ test06() + VERIFY(regex_match("a-", debian_cron_namespace_ok)); + } + ++// libstdc++/102447 ++void ++test07() ++{ ++ VERIFY(regex_match("-", std::regex("[\\w-]", std::regex::ECMAScript))); ++ VERIFY(regex_match("a", std::regex("[\\w-]", std::regex::ECMAScript))); ++ VERIFY(regex_match("-", std::regex("[a-]", std::regex::ECMAScript))); ++ VERIFY(regex_match("a", std::regex("[a-]", std::regex::ECMAScript))); ++ ++ try ++ { ++ std::regex re("[\\w-a]", std::regex::ECMAScript); ++ VERIFY(false); ++ } ++ catch (const std::regex_error& e) ++ { ++ VERIFY(e.code() == std::regex_constants::error_range); ++ } ++ ++ try ++ { ++ std::regex re("[\\w--]", std::regex::ECMAScript); ++ VERIFY(false); ++ } ++ catch (const std::regex_error& e) ++ { ++ VERIFY(e.code() == std::regex_constants::error_range); ++ } ++} ++ + int + main() + { +@@ -167,6 +226,7 @@ main() + test04(); + test05(); + test06(); ++ test07(); + + return 0; + }