From 11ccefabe3cadf5e55c45ee897647c618522bf61 Mon Sep 17 00:00:00 2001 From: Jarek Prokop Date: Fri, 7 Jun 2024 15:06:51 +0200 Subject: [PATCH] Fix REXML DoS parsing an XML with many `<`s in an attribute value (CVE-2024-35176). Resolves: RHEL-37877 --- ruby.spec | 20 + ....2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch | 1662 +++++++++++++++++ ...can-1.0.2-Accept-String-as-a-pattern.patch | 264 +++ 3 files changed, 1946 insertions(+) create mode 100644 rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch create mode 100644 rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch diff --git a/ruby.spec b/ruby.spec index e39a49d..4635f3d 100644 --- a/ruby.spec +++ b/ruby.spec @@ -251,6 +251,21 @@ Patch46: ruby-3.0.7-Fix-CVE-2024-27281-RCE-vulnerability-with-rdoc_options.patch # Backported from: # https://github.com/ruby/ruby/commit/6c6dca749d3f732b7be04bae20095a040c50fdb8 Patch47: ruby-3.0.7-Fix-CVE-2024-27282-Memory-address-read-with-Regex.patch +# Fix for REXML CVE-2024-35176 depends on being able to pass a string to the +# scan method in addition to a regex. +# https://github.com/ruby/strscan/pull/4 +Patch48: rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch +# CVE-2024-35176 REXML: DoS parsing an XML with many `<`s in an attribute value. +# The actual fix for the CVE is https://github.com/ruby/rexml/pull/126 , +# but that PR is depending on the content of a few previous PRs and commits. +# https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc +# https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c +# https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320 +# https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5 +# https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d +# https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb +# https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420 +Patch49: rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch Requires: %{name}-libs%{?_isa} = %{version}-%{release} @@ -669,6 +684,8 @@ sed -i 's/"evaluation\/incorrect_words.yaml"\.freeze, //' \ %patch45 -p1 %patch46 -p1 %patch47 -p1 +%patch48 -p1 +%patch49 -p1 # Provide an example of usage of the tapset: cp -a %{SOURCE3} . @@ -1234,6 +1251,9 @@ OPENSSL_SYSTEM_CIPHERS_OVERRIDE=xyz_nonexistent_file OPENSSL_CONF='' \ - Fix Arbitrary memory address read vulnerability with Regex search. (CVE-2024-27282) Resolves: RHEL-33867 +- Fix REXML DoS parsing an XML with many `<`s in an attribute value. + (CVE-2024-35176) + Resolves: RHEL-37877 * Mon Jun 12 2023 Jarek Prokop - 2.5.9-111 - Fix HTTP response splitting in CGI. diff --git a/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch b/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch new file mode 100644 index 0000000..bf6d96f --- /dev/null +++ b/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch @@ -0,0 +1,1662 @@ +From 694239f0855668c986feba6f1b395ecd94a1f0bc Mon Sep 17 00:00:00 2001 +From: Kouhei Sutou +Date: Fri, 28 Sep 2018 16:29:17 +0900 +Subject: [PATCH 1/7] Make PI parsing more robust + +PI target must be "NAME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))". So +'' is invalid because "=" isn't included in NAME. + +Without this change, 'pos="3"' was accepted as PI target. + +=== +Commit implements #process_instruction method for baseparser +that later patches rely upon. + +https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc +--- + lib/rexml/parsers/baseparser.rb | 18 ++++++++----- + test/rexml/data/t75.xml | 2 +- + .../parse/test_processing_instruction.rb | 25 +++++++++++++++++++ + 3 files changed, 38 insertions(+), 7 deletions(-) + create mode 100644 test/rexml/parse/test_processing_instruction.rb + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index e7ef695912..84a0bcdcec 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -61,7 +61,7 @@ class BaseParser + XMLDECL_START = /\A<\?xml\s/u; + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um + INSTRUCTION_START = /\A<\?/u +- INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um ++ INSTRUCTION_PATTERN = /<\?#{QNAME}(\s+.*?)?\?>/um + TAG_MATCH = /\A<((?>#{QNAME_STR}))/um + CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um + +@@ -229,7 +229,7 @@ def pull_event + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + when INSTRUCTION_START +- return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] ++ return process_instruction + when DOCTYPE_START + base_error_message = "Malformed DOCTYPE" + @source.match(DOCTYPE_START, true) +@@ -395,10 +395,7 @@ def pull_event + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) + elsif @source.buffer[1] == ?? +- md = @source.match( INSTRUCTION_PATTERN, true ) +- return [ :processing_instruction, md[1], md[2] ] if md +- raise REXML::ParseException.new( "Bad instruction declaration", +- @source) ++ return process_instruction + else + # Get the next tag + md = @source.match(TAG_MATCH, true) +@@ -675,6 +672,15 @@ def parse_attributes(prefixes, curr_ns) + end + return attributes, closed + end ++ ++ def process_instruction ++ match_data = @source.match(INSTRUCTION_PATTERN, true) ++ unless match_data ++ message = "Invalid processing instruction node" ++ raise REXML::ParseException.new(message, @source) ++ end ++ [:processing_instruction, match_data[1], match_data[4]] ++ end + end + end + end +diff --git a/test/rexml/data/t75.xml b/test/rexml/data/t75.xml +index 0911fb1b1a..eb3cccee4b 100644 +--- a/test/rexml/data/t75.xml ++++ b/test/rexml/data/t75.xml +@@ -1,4 +1,4 @@ +- ++ + + + ") ++ end ++ assert_equal(<<-DETAIL.chomp, exception.to_s) ++Invalid processing instruction node ++Line: 1 ++Position: 4 ++Last 80 unconsumed characters: ++ ++ DETAIL ++ end ++ end ++ end ++end + +From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001 +From: NAITOH Jun +Date: Wed, 17 Jan 2024 15:32:57 +0900 +Subject: [PATCH 2/7] Use string scanner with baseparser (#105) + +Using StringScanner reduces the string copying process and speeds up the +process. + +And I removed unnecessary methods. + +https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105 + +``` +ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux] +Calculating ------------------------------------- + rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) + dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s + sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s + pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s + stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s + +Comparison: + dom + master(YJIT): 8.3 i/s + 3.2.6(YJIT): 8.1 i/s - 1.02x slower + master: 5.1 i/s - 1.64x slower + rexml 3.2.6: 4.9 i/s - 1.71x slower + + sax + master(YJIT): 20.9 i/s + 3.2.6(YJIT): 19.2 i/s - 1.09x slower + master: 14.0 i/s - 1.50x slower + rexml 3.2.6: 13.6 i/s - 1.54x slower + + pull + master(YJIT): 25.4 i/s + 3.2.6(YJIT): 22.3 i/s - 1.14x slower + master: 16.9 i/s - 1.50x slower + rexml 3.2.6: 15.6 i/s - 1.62x slower + + stream + master(YJIT): 22.2 i/s + 3.2.6(YJIT): 19.8 i/s - 1.12x slower + master: 15.8 i/s - 1.40x slower + rexml 3.2.6: 14.3 i/s - 1.55x slower +``` + +- YJIT=ON : 1.02x - 1.14x faster +- YJIT=OFF : 1.02x - 1.10x faster + +--------- + +Co-authored-by: Sutou Kouhei + +=== +The PR focuses on enhancements in lib/rexml/source.rb. +StringScanner is used instead of just a string for scanning and +matching. +This behavior is relied upon in later patches. + +Preparation for fix for CVE-2024-35176. + +https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c +--- + lib/rexml/parsers/baseparser.rb | 21 ++- + lib/rexml/source.rb | 149 ++++++-------------- + test/rexml/parse/test_entity_declaration.rb | 36 +++++ + test/rexml/test_core.rb | 2 +- + 4 files changed, 93 insertions(+), 115 deletions(-) + create mode 100644 test/rexml/parse/test_entity_declaration.rb + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index 84a0bcdcec..c29783af5c 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -98,7 +98,7 @@ class BaseParser + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "" + GEDECL = "" +- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um ++ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um + + NOTATIONDECL_START = /\A\s*0 +- rv +- end +- + def read + end + +- def consume( pattern ) +- @buffer = $' if pattern.match( @buffer ) +- end +- +- def match_to( char, pattern ) +- return pattern.match(@buffer) +- end +- +- def match_to_consume( char, pattern ) +- md = pattern.match(@buffer) +- @buffer = $' +- return md +- end +- + def match(pattern, cons=false) +- md = pattern.match(@buffer) +- @buffer = $' if cons and md +- return md ++ if cons ++ @scanner.scan(pattern).nil? ? nil : @scanner ++ else ++ @scanner.check(pattern).nil? ? nil : @scanner ++ end + end + + # @return true if the Source is exhausted + def empty? +- @buffer == "" +- end +- +- def position +- @orig.index( @buffer ) ++ @scanner.eos? + end + + # @return the current line in the source + def current_line + lines = @orig.split +- res = lines.grep @buffer[0..30] ++ res = lines.grep @scanner.rest[0..30] + res = res[-1] if res.kind_of? Array + lines.index( res ) if res + end + + private ++ + def detect_encoding +- buffer_encoding = @buffer.encoding ++ scanner_encoding = @scanner.rest.encoding + detected_encoding = "UTF-8" + begin +- @buffer.force_encoding("ASCII-8BIT") +- if @buffer[0, 2] == "\xfe\xff" +- @buffer[0, 2] = "" ++ @scanner.string.force_encoding("ASCII-8BIT") ++ if @scanner.scan(/\xfe\xff/n) + detected_encoding = "UTF-16BE" +- elsif @buffer[0, 2] == "\xff\xfe" +- @buffer[0, 2] = "" ++ elsif @scanner.scan(/\xff\xfe/n) + detected_encoding = "UTF-16LE" +- elsif @buffer[0, 3] == "\xef\xbb\xbf" +- @buffer[0, 3] = "" ++ elsif @scanner.scan(/\xef\xbb\xbf/n) + detected_encoding = "UTF-8" + end + ensure +- @buffer.force_encoding(buffer_encoding) ++ @scanner.string.force_encoding(scanner_encoding) + end + self.encoding = detected_encoding + end + + def encoding_updated + if @encoding != 'UTF-8' +- @buffer = decode(@buffer) ++ @scanner.string = decode(@scanner.rest) + @to_utf = true + else + @to_utf = false +- @buffer.force_encoding ::Encoding::UTF_8 ++ @scanner.string.force_encoding(::Encoding::UTF_8) + end + end + end +@@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil) + end + + if !@to_utf and +- @buffer.respond_to?(:force_encoding) and ++ @orig.respond_to?(:force_encoding) and + @source.respond_to?(:external_encoding) and + @source.external_encoding != ::Encoding::UTF_8 + @force_utf8 = true +@@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil) + end + end + +- def scan(pattern, cons=false) +- rv = super +- # You'll notice that this next section is very similar to the same +- # section in match(), but just a liiittle different. This is +- # because it is a touch faster to do it this way with scan() +- # than the way match() does it; enough faster to warrant duplicating +- # some code +- if rv.size == 0 +- until @buffer =~ pattern or @source.nil? +- begin +- @buffer << readline +- rescue Iconv::IllegalSequence +- raise +- rescue +- @source = nil +- end +- end +- rv = super +- end +- rv.taint +- rv +- end +- + def read + begin +- @buffer << readline ++ # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, ++ # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. ++ # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed ++ # and avoids this problem. ++ @scanner.string = @scanner.rest + readline + rescue Exception, NameError + @source = nil + end + end + +- def consume( pattern ) +- match( pattern, true ) +- end +- + def match( pattern, cons=false ) +- rv = pattern.match(@buffer) +- @buffer = $' if cons and rv +- while !rv and @source ++ if cons ++ md = @scanner.scan(pattern) ++ else ++ md = @scanner.check(pattern) ++ end ++ while md.nil? and @source + begin +- @buffer << readline +- rv = pattern.match(@buffer) +- @buffer = $' if cons and rv ++ @scanner << readline ++ if cons ++ md = @scanner.scan(pattern) ++ else ++ md = @scanner.check(pattern) ++ end + rescue + @source = nil + end + end +- rv.taint +- rv ++ ++ md.nil? ? nil : @scanner + end + + def empty? + super and ( @source.nil? || @source.eof? ) + end + +- def position +- @er_source.pos rescue 0 +- end +- + # @return the current line in the source + def current_line + begin +@@ -289,7 +234,7 @@ def encoding_updated + @source.set_encoding(@encoding, @encoding) + end + @line_break = encode(">") +- @pending_buffer, @buffer = @buffer, "" ++ @pending_buffer, @scanner.string = @scanner.rest, "" + @pending_buffer.force_encoding(@encoding) + super + end +diff --git a/test/rexml/parse/test_entity_declaration.rb b/test/rexml/parse/test_entity_declaration.rb +new file mode 100644 +index 0000000000..e15deec60d +--- /dev/null ++++ b/test/rexml/parse/test_entity_declaration.rb +@@ -0,0 +1,36 @@ ++# frozen_string_literal: false ++require 'test/unit' ++require 'rexml/document' ++ ++module REXMLTests ++ class TestParseEntityDeclaration < Test::Unit::TestCase ++ private ++ def xml(internal_subset) ++ <<-XML ++ ++ ++ XML ++ end ++ ++ def parse(internal_subset) ++ REXML::Document.new(xml(internal_subset)).doctype ++ end ++ ++ def test_empty ++ exception = assert_raise(REXML::ParseException) do ++ parse(<<-INTERNAL_SUBSET) ++ ++ INTERNAL_SUBSET ++ end ++ assert_equal(<<-DETAIL.chomp, exception.to_s) ++Malformed notation declaration: name is missing ++Line: 5 ++Position: 72 ++Last 80 unconsumed characters: ++ ]> ++ DETAIL ++ end ++ end ++end +diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb +index ee5438d5e5..d4ece491b9 100644 +--- a/test/rexml/test_core.rb ++++ b/test/rexml/test_core.rb +@@ -681,7 +681,7 @@ def test_iso_8859_1_output_function + koln_iso_8859_1 = "K\xF6ln" + koln_utf8 = "K\xc3\xb6ln" + source = Source.new( koln_iso_8859_1, 'iso-8859-1' ) +- results = source.scan(/.*/)[0] ++ results = source.match(/.*/)[0] + koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding) + assert_equal koln_utf8, results + output << results + +From 77128555476cb0db798e2912fb3a07d6411dc320 Mon Sep 17 00:00:00 2001 +From: NAITOH Jun +Date: Sun, 21 Jan 2024 20:02:00 +0900 +Subject: [PATCH 3/7] Use `@scanner << readline` instead of `@scanner.string = + @scanner.rest + readline` (#107) + +## Why + +JRuby's `StringScanner#<<` and `StringScanner#scan` OutOfMemoryError has +been resolved in strscan gem 3.0.9. + +https://github.com/ruby/strscan/issues/83 + +## Benchmark + +``` +RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml +ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] +Calculating ------------------------------------- + before after before(YJIT) after(YJIT) + dom 10.958 11.044 16.615 16.783 i/s - 100.000 times in 9.126104s 9.055023s 6.018799s 5.958437s + sax 29.624 29.609 44.390 45.370 i/s - 100.000 times in 3.375641s 3.377372s 2.252774s 2.204080s + pull 33.868 34.695 51.173 53.492 i/s - 100.000 times in 2.952679s 2.882229s 1.954138s 1.869422s + stream 31.719 32.351 43.604 45.403 i/s - 100.000 times in 3.152713s 3.091052s 2.293356s 2.202514s + +Comparison: + dom + after(YJIT): 16.8 i/s + before(YJIT): 16.6 i/s - 1.01x slower + after: 11.0 i/s - 1.52x slower + before: 11.0 i/s - 1.53x slower + + sax + after(YJIT): 45.4 i/s + before(YJIT): 44.4 i/s - 1.02x slower + before: 29.6 i/s - 1.53x slower + after: 29.6 i/s - 1.53x slower + + pull + after(YJIT): 53.5 i/s + before(YJIT): 51.2 i/s - 1.05x slower + after: 34.7 i/s - 1.54x slower + before: 33.9 i/s - 1.58x slower + + stream + after(YJIT): 45.4 i/s + before(YJIT): 43.6 i/s - 1.04x slower + after: 32.4 i/s - 1.40x slower + before: 31.7 i/s - 1.43x slower + +``` + +- YJIT=ON : 1.01x - 1.05x faster +- YJIT=OFF : 1.00x - 1.02x faster +=== +Backported, as the PR #114 and mainly PR #126 count with the method using `<<` +instead of setting the scanner's string directly. + +https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320 +--- + lib/rexml/source.rb | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb +index 9883e79210..4624777fa0 100644 +--- a/lib/rexml/source.rb ++++ b/lib/rexml/source.rb +@@ -149,11 +149,7 @@ def initialize(arg, block_size=500, encoding=nil) + + def read + begin +- # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, +- # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. +- # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed +- # and avoids this problem. +- @scanner.string = @scanner.rest + readline ++ @scanner << readline + rescue Exception, NameError + @source = nil + end + +From 370666e314816b57ecd5878e757224c3b6bc93f5 Mon Sep 17 00:00:00 2001 +From: NAITOH Jun +Date: Tue, 27 Feb 2024 09:48:35 +0900 +Subject: [PATCH 4/7] Use more StringScanner based API to parse XML (#114) + +## Why? + +Improve maintainability by optimizing the process so that the parsing +process proceeds using StringScanner#scan. + +## Changed +- Change `REXML::Parsers::BaseParser` from `frozen_string_literal: +false` to `frozen_string_literal: true`. +- Added `Source#string=` method for error message output. +- Added TestParseDocumentTypeDeclaration#test_no_name test case. +- Of the `intSubset` of DOCTYPE, " + +=== +This commit introduces wider use of StringScanner in +lib/rexml/parsers/baseparse.rb. + +Which is relied upon in later patches. + +MISSING_ATTRIBUTE_QUOTES is not present in upstream +lib/rexml/parsers/baseparser.rb at this point. +While it doesn't seem to be used, this patch does not +remove the constant so let's keep it around. + +https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5 +--- + lib/rexml/parsers/baseparser.rb | 328 ++++++++++-------- + lib/rexml/source.rb | 31 +- + .../parse/test_document_type_declaration.rb | 15 + + 3 files changed, 205 insertions(+), 169 deletions(-) + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index c29783af5c..dafc1b1449 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -1,4 +1,4 @@ +-# frozen_string_literal: false ++# frozen_string_literal: true + + require "strscan" + +@@ -121,6 +121,19 @@ class BaseParser + ###################################################################### + MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um + ++ module Private ++ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um ++ TAG_PATTERN = /((?>#{QNAME_STR}))/um ++ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ++ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um ++ NAME_PATTERN = /\s*#{NAME}/um ++ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" ++ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ++ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um ++ end ++ private_constant :Private ++ include Private ++ + def initialize( source ) + self.stream = source + @listeners = [] +@@ -206,180 +219,172 @@ def pull_event + #STDERR.puts @source.encoding + #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + if @document_status == nil +- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) +- word = word[1] unless word.nil? +- #STDERR.puts "WORD = #{word.inspect}" +- case word +- when COMMENT_START +- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] +- when XMLDECL_START +- #STDERR.puts "XMLDECL" +- results = @source.match( XMLDECL_PATTERN, true )[1] +- version = VERSION.match( results ) +- version = version[1] unless version.nil? +- encoding = ENCODING.match(results) +- encoding = encoding[1] unless encoding.nil? +- if need_source_encoding_update?(encoding) +- @source.encoding = encoding +- end +- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding +- encoding = "UTF-16" +- end +- standalone = STANDALONE.match(results) +- standalone = standalone[1] unless standalone.nil? +- return [ :xmldecl, version, encoding, standalone ] +- when INSTRUCTION_START ++ if @source.match("/um, true) +- id = [nil, nil, nil] +- @document_status = :after_doctype +- else +- id = parse_id(base_error_message, +- accept_external_id: true, +- accept_public_id: false) +- if id[0] == "SYSTEM" +- # For backward compatibility +- id[1], id[2] = id[2], nil ++ elsif @source.match("/um, true)[1] ] ++ elsif @source.match("DOCTYPE", true) ++ base_error_message = "Malformed DOCTYPE" ++ unless @source.match(/\s+/um, true) ++ if @source.match(">") ++ message = "#{base_error_message}: name is missing" ++ else ++ message = "#{base_error_message}: invalid name" ++ end ++ @source.string = "/um, true) ++ @nsstack.unshift(curr_ns=Set.new) ++ name = parse_name(base_error_message) ++ if @source.match(/\s*\[/um, true) ++ id = [nil, nil, nil] ++ @document_status = :in_doctype ++ elsif @source.match(/\s*>/um, true) ++ id = [nil, nil, nil] + @document_status = :after_doctype + else +- message = "#{base_error_message}: garbage after external ID" +- raise REXML::ParseException.new(message, @source) ++ id = parse_id(base_error_message, ++ accept_external_id: true, ++ accept_public_id: false) ++ if id[0] == "SYSTEM" ++ # For backward compatibility ++ id[1], id[2] = id[2], nil ++ end ++ if @source.match(/\s*\[/um, true) ++ @document_status = :in_doctype ++ elsif @source.match(/\s*>/um, true) ++ @document_status = :after_doctype ++ else ++ message = "#{base_error_message}: garbage after external ID" ++ raise REXML::ParseException.new(message, @source) ++ end + end +- end +- args = [:start_doctype, name, *id] +- if @document_status == :after_doctype +- @source.match(/\A\s*/um, true) +- @stack << [ :end_doctype ] +- end +- return args +- when /\A\s+/ +- else +- @document_status = :after_doctype +- if @source.encoding == "UTF-8" +- @source.buffer_encoding = ::Encoding::UTF_8 ++ args = [:start_doctype, name, *id] ++ if @document_status == :after_doctype ++ @source.match(/\s*/um, true) ++ @stack << [ :end_doctype ] ++ end ++ return args ++ else ++ message = "Invalid XML" ++ raise REXML::ParseException.new(message, @source) + end + end + end + if @document_status == :in_doctype +- md = @source.match(/\A\s*(.*?>)/um) +- case md[1] +- when SYSTEMENTITY +- match = @source.match( SYSTEMENTITY, true )[1] +- return [ :externalentity, match ] +- +- when ELEMENTDECL_START +- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] +- +- when ENTITY_START +- match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact] +- ref = false +- if match[1] == '%' +- ref = true +- match.delete_at 1 +- end +- # Now we have to sort out what kind of entity reference this is +- if match[2] == 'SYSTEM' +- # External reference +- match[3] = match[3][1..-2] # PUBID +- match.delete_at(4) if match.size > 4 # Chop out NDATA decl +- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] +- elsif match[2] == 'PUBLIC' +- # External reference +- match[3] = match[3][1..-2] # PUBID +- match[4] = match[4][1..-2] # HREF +- match.delete_at(5) if match.size > 5 # Chop out NDATA decl +- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] +- else +- match[2] = match[2][1..-2] +- match.pop if match.size == 4 +- # match is [ :entity, name, value ] +- end +- match << '%' if ref +- return match +- when ATTLISTDECL_START +- md = @source.match( ATTLISTDECL_PATTERN, true ) +- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? +- element = md[1] +- contents = md[0] +- +- pairs = {} +- values = md[0].scan( ATTDEF_RE ) +- values.each do |attdef| +- unless attdef[3] == "#IMPLIED" +- attdef.compact! +- val = attdef[3] +- val = attdef[4] if val == "#FIXED " +- pairs[attdef[0]] = val +- if attdef[0] =~ /^xmlns:(.*)/ +- @nsstack[0] << $1 +- end ++ @source.match(/\s*/um, true) # skip spaces ++ if @source.match("/um, true) ++ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? ++ return [ :elementdecl, "/um) +- message = "#{base_error_message}: name is missing" ++ # Now we have to sort out what kind of entity reference this is ++ if match[2] == 'SYSTEM' ++ # External reference ++ match[3] = match[3][1..-2] # PUBID ++ match.delete_at(4) if match.size > 4 # Chop out NDATA decl ++ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] ++ elsif match[2] == 'PUBLIC' ++ # External reference ++ match[3] = match[3][1..-2] # PUBID ++ match[4] = match[4][1..-2] # HREF ++ match.delete_at(5) if match.size > 5 # Chop out NDATA decl ++ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] + else +- message = "#{base_error_message}: invalid declaration name" ++ match[2] = match[2][1..-2] ++ match.pop if match.size == 4 ++ # match is [ :entity, name, value ] + end +- raise REXML::ParseException.new(message, @source) +- end +- name = parse_name(base_error_message) +- id = parse_id(base_error_message, +- accept_external_id: true, +- accept_public_id: true) +- unless @source.match(/\A\s*>/um, true) +- message = "#{base_error_message}: garbage before end >" +- raise REXML::ParseException.new(message, @source) ++ match << '%' if ref ++ return match ++ elsif @source.match("ATTLIST", true) ++ md = @source.match(ATTLISTDECL_END, true) ++ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? ++ element = md[1] ++ contents = md[0] ++ ++ pairs = {} ++ values = md[0].scan( ATTDEF_RE ) ++ values.each do |attdef| ++ unless attdef[3] == "#IMPLIED" ++ attdef.compact! ++ val = attdef[3] ++ val = attdef[4] if val == "#FIXED " ++ pairs[attdef[0]] = val ++ if attdef[0] =~ /^xmlns:(.*)/ ++ @nsstack[0] << $1 ++ end ++ end ++ end ++ return [ :attlistdecl, element, pairs, contents ] ++ elsif @source.match("NOTATION", true) ++ base_error_message = "Malformed notation declaration" ++ unless @source.match(/\s+/um, true) ++ if @source.match(">") ++ message = "#{base_error_message}: name is missing" ++ else ++ message = "#{base_error_message}: invalid name" ++ end ++ @source.string = " /um, true) ++ message = "#{base_error_message}: garbage before end >" ++ raise REXML::ParseException.new(message, @source) ++ end ++ return [:notationdecl, name, *id] ++ elsif md = @source.match(/--(.*?)-->/um, true) ++ case md[1] ++ when /--/, /-\z/ ++ raise REXML::ParseException.new("Malformed comment", @source) ++ end ++ return [ :comment, md[1] ] if md + end +- return [:notationdecl, name, *id] +- when DOCTYPE_END ++ elsif match = @source.match(/(%.*?;)\s*/um, true) ++ return [ :externalentity, match[1] ] ++ elsif @source.match(/\]\s*>/um, true) + @document_status = :after_doctype +- @source.match( DOCTYPE_END, true ) + return [ :end_doctype ] + end + end + if @document_status == :after_doctype +- @source.match(/\A\s*/um, true) ++ @source.match(/\s*/um, true) + end + begin +- @source.read if @source.buffer.size<2 +- if @source.buffer[0] == ?< +- if @source.buffer[1] == ?/ ++ if @source.match("<", true) ++ if @source.match("/", true) + @nsstack.shift + last_tag = @tags.pop +- #md = @source.match_to_consume( '>', CLOSE_MATCH) +- md = @source.match( CLOSE_MATCH, true ) ++ md = @source.match(CLOSE_PATTERN, true) + if md and !last_tag + message = "Unexpected top-level end tag (got '#{md[1]}')" + raise REXML::ParseException.new(message, @source) + end + if md.nil? or last_tag != md[1] + message = "Missing end tag for '#{last_tag}'" +- message << " (got '#{md[1]}')" if md ++ message += " (got '#{md[1]}')" if md ++ @source.string = "]*>)/um) ++ elsif @source.match("!", true) ++ md = @source.match(/([^>]*>)/um) + #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" + raise REXML::ParseException.new("Malformed node", @source) unless md +- if md[0][2] == ?- +- md = @source.match( COMMENT_PATTERN, true ) ++ if md[0][0] == ?- ++ md = @source.match(/--(.*?)-->/um, true) + + case md[1] + when /--/, /-\z/ +@@ -388,19 +393,18 @@ def pull_event + + return [ :comment, md[1] ] if md + else +- md = @source.match( CDATA_PATTERN, true ) ++ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) + return [ :cdata, md[1] ] if md + end + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) +- elsif @source.buffer[1] == ?? ++ elsif @source.match("?", true) + return process_instruction + else + # Get the next tag +- md = @source.match(TAG_MATCH, true) ++ md = @source.match(TAG_PATTERN, true) + unless md +- # Check for missing attribute quotes +- raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) ++ @source.string = "<" + @source.buffer + raise REXML::ParseException.new("malformed XML: missing tag start", @source) + end + tag = md[1] +@@ -425,7 +429,7 @@ def pull_event + return [ :start_element, tag, attributes ] + end + else +- md = @source.match( TEXT_PATTERN, true ) ++ md = @source.match(/([^<]*)/um, true) + text = md[1] + if md[0].length == 0 + @source.match( /(\s+)/, true ) +@@ -472,8 +476,7 @@ def normalize( input, entities=nil, entity_filter=nil ) + + # Unescapes all possible entities + def unnormalize( string, entities=nil, filter=nil ) +- rv = string.clone +- rv.gsub!( /\r\n?/, "\n" ) ++ rv = string.gsub( /\r\n?/, "\n" ) + matches = rv.scan( REFERENCE_RE ) + return rv if matches.size == 0 + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { +@@ -508,9 +511,9 @@ def need_source_encoding_update?(xml_declaration_encoding) + end + + def parse_name(base_error_message) +- md = @source.match(/\A\s*#{NAME}/um, true) ++ md = @source.match(NAME_PATTERN, true) + unless md +- if @source.match(/\A\s*\S/um) ++ if @source.match(/\s*\S/um) + message = "#{base_error_message}: invalid name" + else + message = "#{base_error_message}: name is missing" +@@ -671,12 +674,29 @@ def parse_attributes(prefixes, curr_ns) + end + + def process_instruction +- match_data = @source.match(INSTRUCTION_PATTERN, true) ++ match_data = @source.match(INSTRUCTION_END, true) + unless match_data + message = "Invalid processing instruction node" ++ @source.string = " + DETAIL + end ++ ++ def test_no_name ++ exception = assert_raise(REXML::ParseException) do ++ parse(<<-DOCTYPE) ++ ++ DOCTYPE ++ end ++ assert_equal(<<-DETAIL.chomp, exception.to_s) ++Malformed DOCTYPE: name is missing ++Line: 3 ++Position: 17 ++Last 80 unconsumed characters: ++ ++ DETAIL ++ end + end + + class TestExternalID < self + +From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001 +From: NAITOH Jun +Date: Mon, 18 Mar 2024 23:30:47 +0900 +Subject: [PATCH 5/7] Optimize the parse_attributes method to use + `Source#match` to parse XML. (#119) + +## Why? + +Improve maintainability by consolidating processing into `Source#match`. + +## Benchmark +``` +RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml +ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] +Calculating ------------------------------------- + before after before(YJIT) after(YJIT) + dom 10.891 10.622 16.356 17.403 i/s - 100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s + sax 30.335 29.845 49.749 54.877 i/s - 100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s + pull 35.514 34.801 61.123 66.908 i/s - 100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s + stream 35.141 34.475 52.110 56.836 i/s - 100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s + +Comparison: + dom + after(YJIT): 17.4 i/s + before(YJIT): 16.4 i/s - 1.06x slower + before: 10.9 i/s - 1.60x slower + after: 10.6 i/s - 1.64x slower + + sax + after(YJIT): 54.9 i/s + before(YJIT): 49.7 i/s - 1.10x slower + before: 30.3 i/s - 1.81x slower + after: 29.8 i/s - 1.84x slower + + pull + after(YJIT): 66.9 i/s + before(YJIT): 61.1 i/s - 1.09x slower + before: 35.5 i/s - 1.88x slower + after: 34.8 i/s - 1.92x slower + + stream + after(YJIT): 56.8 i/s + before(YJIT): 52.1 i/s - 1.09x slower + before: 35.1 i/s - 1.62x slower + after: 34.5 i/s - 1.65x slower + +``` + +- YJIT=ON : 1.06x - 1.10x faster +- YJIT=OFF : 0.97x - 0.98x faster +=== +"test_attribute_namespace_conflict" in test/rexml/test_core.rb +adjustment not applied as the commit that added the test is not present. + +Wider use of Source#match to parse XML. + +Changes the way BaseParser#parse_attribute +grabs fields that make up an XML attribute. + +This is relied upon by upstream 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb +that actually fixes CVE-2024-35176. + +https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d +--- + lib/rexml/parsers/baseparser.rb | 113 ++++++++++++------------------- + test/rexml/parse/test_element.rb | 2 +- + test/rexml/test_core.rb | 17 ++++- + 3 files changed, 61 insertions(+), 71 deletions(-) + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index dafc1b1449..45eabe8cc5 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -123,7 +123,7 @@ class BaseParser + + module Private + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um +- TAG_PATTERN = /((?>#{QNAME_STR}))/um ++ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /\s*#{NAME}/um +@@ -592,85 +592,60 @@ def parse_id_invalid_details(accept_external_id:, + def parse_attributes(prefixes, curr_ns) + attributes = {} + closed = false +- match_data = @source.match(/^(.*?)(\/)?>/um, true) +- if match_data.nil? +- message = "Start tag isn't ended" +- raise REXML::ParseException.new(message, @source) +- end +- +- raw_attributes = match_data[1] +- closed = !match_data[2].nil? +- return attributes, closed if raw_attributes.nil? +- return attributes, closed if raw_attributes.empty? +- +- scanner = StringScanner.new(raw_attributes) +- until scanner.eos? +- if scanner.scan(/\s+/) +- break if scanner.eos? +- end +- +- pos = scanner.pos +- loop do +- break if scanner.scan(ATTRIBUTE_PATTERN) +- unless scanner.scan(QNAME) +- message = "Invalid attribute name: <#{scanner.rest}>" +- raise REXML::ParseException.new(message, @source) +- end +- name = scanner[0] +- unless scanner.scan(/\s*=\s*/um) ++ while true ++ if @source.match(">", true) ++ return attributes, closed ++ elsif @source.match("/>", true) ++ closed = true ++ return attributes, closed ++ elsif match = @source.match(QNAME, true) ++ name = match[1] ++ prefix = match[2] ++ local_part = match[3] ++ ++ unless @source.match(/\s*=\s*/um, true) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end +- quote = scanner.scan(/['"]/) +- unless quote +- message = "Missing attribute value start quote: <#{name}>" +- raise REXML::ParseException.new(message, @source) +- end +- unless scanner.scan(/.*#{Regexp.escape(quote)}/um) +- match_data = @source.match(/^(.*?)(\/)?>/um, true) +- if match_data +- scanner << "/" if closed +- scanner << ">" +- scanner << match_data[1] +- scanner.pos = pos +- closed = !match_data[2].nil? +- next ++ unless match = @source.match(/(['"])(.*?)\1\s*/um, true) ++ if match = @source.match(/(['"])/, true) ++ message = ++ "Missing attribute value end quote: <#{name}>: <#{match[1]}>" ++ raise REXML::ParseException.new(message, @source) ++ else ++ message = "Missing attribute value start quote: <#{name}>" ++ raise REXML::ParseException.new(message, @source) + end +- message = +- "Missing attribute value end quote: <#{name}>: <#{quote}>" +- raise REXML::ParseException.new(message, @source) + end +- end +- name = scanner[1] +- prefix = scanner[2] +- local_part = scanner[3] +- # quote = scanner[4] +- value = scanner[5] +- if prefix == "xmlns" +- if local_part == "xml" +- if value != "http://www.w3.org/XML/1998/namespace" +- msg = "The 'xml' prefix must not be bound to any other namespace "+ ++ value = match[2] ++ if prefix == "xmlns" ++ if local_part == "xml" ++ if value != "http://www.w3.org/XML/1998/namespace" ++ msg = "The 'xml' prefix must not be bound to any other namespace "+ ++ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" ++ raise REXML::ParseException.new( msg, @source, self ) ++ end ++ elsif local_part == "xmlns" ++ msg = "The 'xmlns' prefix must not be declared "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" +- raise REXML::ParseException.new( msg, @source, self ) ++ raise REXML::ParseException.new( msg, @source, self) + end +- elsif local_part == "xmlns" +- msg = "The 'xmlns' prefix must not be declared "+ +- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" +- raise REXML::ParseException.new( msg, @source, self) ++ curr_ns << local_part ++ elsif prefix ++ prefixes << prefix unless prefix == "xml" + end +- curr_ns << local_part +- elsif prefix +- prefixes << prefix unless prefix == "xml" +- end + +- if attributes.has_key?(name) +- msg = "Duplicate attribute #{name.inspect}" +- raise REXML::ParseException.new(msg, @source, self) +- end ++ if attributes.has_key?(name) ++ msg = "Duplicate attribute #{name.inspect}" ++ raise REXML::ParseException.new(msg, @source, self) ++ end + +- attributes[name] = value ++ attributes[name] = value ++ else ++ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" ++ raise REXML::ParseException.new(message, @source) ++ end + end +- return attributes, closed + end + + def process_instruction +diff --git a/test/rexml/parse/test_element.rb b/test/rexml/parse/test_element.rb +index e8dce4b997..987214f3bb 100644 +--- a/test/rexml/parse/test_element.rb ++++ b/test/rexml/parse/test_element.rb +@@ -43,7 +43,7 @@ def test_empty_namespace_attribute_name + Line: 1 + Position: 13 + Last 80 unconsumed characters: +- ++:a=""> + DETAIL + end + +diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb +index d4ece491b9..46de90ccd7 100644 +--- a/test/rexml/test_core.rb ++++ b/test/rexml/test_core.rb +@@ -1277,11 +1277,26 @@ def test_ticket_21 + exception = assert_raise(ParseException) do + Document.new(src) + end +- assert_equal(<<-DETAIL, exception.to_s) ++ assert_equal(<<-DETAIL.chomp, exception.to_s) + Missing attribute value start quote: + Line: 1 + Position: 16 + Last 80 unconsumed characters: ++value/> ++ DETAIL ++ end ++ ++ def test_parse_exception_on_missing_attribute_end_quote ++ src = ' ++Line: 1 ++Position: 17 ++Last 80 unconsumed characters: ++value/> + DETAIL + end + + +From 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb Mon Sep 17 00:00:00 2001 +From: Nobuyoshi Nakada +Date: Thu, 16 May 2024 11:26:51 +0900 +Subject: [PATCH 6/7] Read quoted attributes in chunks (#126) + +=== +test/rexml/test_document.rb: +* test_gt_linear_performance +Test not included, Ruby 2.5 does not have assert_linear_performance +method available in the test suite. It was introduced in Ruby 2.7. +* Do not require 'core_assertions' +Current requires are fine for running the tests in mock build. + +The fix for CVE-2024-35176. By reading the field in chunks instead of 1 +by 1 it improves performance for large strings composed of angle +brackets. + +https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb +--- + lib/rexml/parsers/baseparser.rb | 20 ++++++++++---------- + lib/rexml/source.rb | 29 ++++++++++++++++++++++++----- + 2 files changed, 34 insertions(+), 15 deletions(-) + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index 45eabe8cc5..304835e407 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -607,17 +607,17 @@ def parse_attributes(prefixes, curr_ns) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end +- unless match = @source.match(/(['"])(.*?)\1\s*/um, true) +- if match = @source.match(/(['"])/, true) +- message = +- "Missing attribute value end quote: <#{name}>: <#{match[1]}>" +- raise REXML::ParseException.new(message, @source) +- else +- message = "Missing attribute value start quote: <#{name}>" +- raise REXML::ParseException.new(message, @source) +- end ++ unless match = @source.match(/(['"])/, true) ++ message = "Missing attribute value start quote: <#{name}>" ++ raise REXML::ParseException.new(message, @source) ++ end ++ quote = match[1] ++ value = @source.read_until(quote) ++ unless value.chomp!(quote) ++ message = "Missing attribute value end quote: <#{name}>: <#{quote}>" ++ raise REXML::ParseException.new(message, @source) + end +- value = match[2] ++ @source.match(/\s*/um, true) + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" +diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb +index d2a6ad872e..2a3bd36695 100644 +--- a/lib/rexml/source.rb ++++ b/lib/rexml/source.rb +@@ -65,7 +65,11 @@ def encoding=(enc) + encoding_updated + end + +- def read ++ def read(term = nil) ++ end ++ ++ def read_until(term) ++ @scanner.scan_until(Regexp.union(term)) or @scanner.rest + end + + def match(pattern, cons=false) +@@ -151,9 +155,9 @@ def initialize(arg, block_size=500, encoding=nil) + end + end + +- def read ++ def read(term = nil) + begin +- @scanner << readline ++ @scanner << readline(term) + true + rescue Exception, NameError + @source = nil +@@ -161,6 +165,21 @@ def read + end + end + ++ def read_until(term) ++ pattern = Regexp.union(term) ++ data = [] ++ begin ++ until str = @scanner.scan_until(pattern) ++ @scanner << readline(term) ++ end ++ rescue EOFError ++ @scanner.rest ++ else ++ read if @scanner.eos? and !@source.eof? ++ str ++ end ++ end ++ + def match( pattern, cons=false ) + read if @scanner.eos? && @source + while true +@@ -204,8 +223,8 @@ def current_line + end + + private +- def readline +- str = @source.readline(@line_break) ++ def readline(term = nil) ++ str = @source.readline(term || @line_break) + if @pending_buffer + if str.nil? + str = @pending_buffer + +From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001 +From: Sutou Kouhei +Date: Mon, 20 May 2024 12:17:27 +0900 +Subject: [PATCH 7/7] Add support for old strscan + +Fix GH-132 + +If we support old strscan, users can also use strscan installed as a +default gem. + +Reported by Adam. Thanks!!! + +=== +strscan shipped with Ruby 2.5 lacks a method StringScanner#captures. +ruby/rexml#105 brought this dependency on the method. + +https://bugs.ruby-lang.org/issues/20516#note-11 +https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420 +--- + lib/rexml/parsers/baseparser.rb | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb +index 304835e407..006d8acb9b 100644 +--- a/lib/rexml/parsers/baseparser.rb ++++ b/lib/rexml/parsers/baseparser.rb +@@ -9,6 +9,17 @@ + + module REXML + module Parsers ++ if StringScanner::Version < "3.0.8" ++ module StringScannerCaptures ++ refine StringScanner do ++ def captures ++ values_at(*(1...size)) ++ end ++ end ++ end ++ using StringScannerCaptures ++ end ++ + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) diff --git a/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch b/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch new file mode 100644 index 0000000..59e734f --- /dev/null +++ b/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch @@ -0,0 +1,264 @@ +From e56ac27d19cc3acdf6c1cb13b14224c43df5f5f6 Mon Sep 17 00:00:00 2001 +From: Kouhei Sutou +Date: Thu, 4 Apr 2019 17:52:50 +0900 +Subject: [PATCH] Accept String as a pattern + +It's only for head only match case such as StringScanner#scan. + +If we use a String as a pattern, we can improve match performance. +Here is a result of the including benchmark. It shows String as a +pattern is 1.25x faster than Regexp as a pattern. + + % rake benchmark + /tmp/local/bin/ruby -S benchmark-driver benchmark/scan.yaml + Warming up -------------------------------------- + regexp 12.094M i/s - 12.242M times in 1.012250s (82.69ns/i, 277clocks/i) + string 14.653M i/s - 14.889M times in 1.016124s (68.25ns/i, 252clocks/i) + Calculating ------------------------------------- + regexp 14.713M i/s - 36.281M times in 2.465970s (67.97ns/i, 254clocks/i) + string 18.422M i/s - 43.959M times in 2.386255s (54.28ns/i, 201clocks/i) + + Comparison: + string: 18421631.8 i/s + regexp: 14712660.7 i/s - 1.25x slower + +==== +Backport https://github.com/ruby/strscan/pull/4 for strscan. + +REXML fixes for CVE-2024-35716 depend on this feature. +--- + ext/strscan/strscan.c | 92 +++++++++++++++++++----------- + test/strscan/test_stringscanner.rb | 45 ++++++++++++++- + 2 files changed, 100 insertions(+), 37 deletions(-) + +diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c +index d6168a0d4f..43319b672e 100644 +--- a/ext/strscan/strscan.c ++++ b/ext/strscan/strscan.c +@@ -447,15 +447,18 @@ strscan_set_pos(VALUE self, VALUE v) + } + + static VALUE +-strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly) ++strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly) + { +- regex_t *rb_reg_prepare_re(VALUE re, VALUE str); + struct strscanner *p; +- regex_t *re; +- long ret; +- int tmpreg; + +- Check_Type(regex, T_REGEXP); ++ if (headonly) { ++ if (!RB_TYPE_P(pattern, T_REGEXP)) { ++ StringValue(pattern); ++ } ++ } ++ else { ++ Check_Type(pattern, T_REGEXP); ++ } + GET_SCANNER(self, p); + + CLEAR_MATCH_STATUS(p); +@@ -463,37 +466,55 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly) + return Qnil; + } + +- p->regex = regex; +- re = rb_reg_prepare_re(regex, p->str); +- tmpreg = re != RREGEXP_PTR(regex); +- if (!tmpreg) RREGEXP(regex)->usecnt++; ++ if (RB_TYPE_P(pattern, T_REGEXP)) { ++ regex_t *rb_reg_prepare_re(VALUE re, VALUE str); ++ regex_t *re; ++ long ret; ++ int tmpreg; ++ ++ p->regex = pattern; ++ re = rb_reg_prepare_re(pattern, p->str); ++ tmpreg = re != RREGEXP_PTR(pattern); ++ if (!tmpreg) RREGEXP(pattern)->usecnt++; ++ ++ if (headonly) { ++ ret = onig_match(re, (UChar* )CURPTR(p), ++ (UChar* )(CURPTR(p) + S_RESTLEN(p)), ++ (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE); ++ } ++ else { ++ ret = onig_search(re, ++ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), ++ (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), ++ &(p->regs), ONIG_OPTION_NONE); ++ } ++ if (!tmpreg) RREGEXP(pattern)->usecnt--; ++ if (tmpreg) { ++ if (RREGEXP(pattern)->usecnt) { ++ onig_free(re); ++ } ++ else { ++ onig_free(RREGEXP_PTR(pattern)); ++ RREGEXP_PTR(pattern) = re; ++ } ++ } + +- if (headonly) { +- ret = onig_match(re, (UChar* )CURPTR(p), +- (UChar* )(CURPTR(p) + S_RESTLEN(p)), +- (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE); ++ if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); ++ if (ret < 0) { ++ /* not matched */ ++ return Qnil; ++ } + } + else { +- ret = onig_search(re, +- (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), +- (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)), +- &(p->regs), ONIG_OPTION_NONE); +- } +- if (!tmpreg) RREGEXP(regex)->usecnt--; +- if (tmpreg) { +- if (RREGEXP(regex)->usecnt) { +- onig_free(re); ++ rb_enc_check(p->str, pattern); ++ if (S_RESTLEN(p) < RSTRING_LEN(pattern)) { ++ return Qnil; + } +- else { +- onig_free(RREGEXP_PTR(regex)); +- RREGEXP_PTR(regex) = re; ++ if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { ++ return Qnil; + } +- } +- +- if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); +- if (ret < 0) { +- /* not matched */ +- return Qnil; ++ onig_region_clear(&(p->regs)); ++ onig_region_set(&(p->regs), 0, 0, RSTRING_LEN(pattern)); + } + + MATCHED(p); +@@ -520,7 +541,8 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly) + * p s.scan(/\w+/) # -> "test" + * p s.scan(/\w+/) # -> nil + * p s.scan(/\s+/) # -> " " +- * p s.scan(/\w+/) # -> "string" ++ * p s.scan("str") # -> "str" ++ * p s.scan(/\w+/) # -> "ing" + * p s.scan(/./) # -> nil + * + */ +@@ -539,6 +561,7 @@ strscan_scan(VALUE self, VALUE re) + * s = StringScanner.new('test string') + * p s.match?(/\w+/) # -> 4 + * p s.match?(/\w+/) # -> 4 ++ * p s.match?("test") # -> 4 + * p s.match?(/\s+/) # -> nil + */ + static VALUE +@@ -560,7 +583,8 @@ strscan_match_p(VALUE self, VALUE re) + * p s.skip(/\w+/) # -> 4 + * p s.skip(/\w+/) # -> nil + * p s.skip(/\s+/) # -> 1 +- * p s.skip(/\w+/) # -> 6 ++ * p s.skip("st") # -> 2 ++ * p s.skip(/\w+/) # -> 4 + * p s.skip(/./) # -> nil + * + */ +diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb +index 3423f9cfed..63b1ce1f9b 100644 +--- a/test/strscan/test_stringscanner.rb ++++ b/test/strscan/test_stringscanner.rb +@@ -282,6 +282,22 @@ def test_scan + assert_equal "", s.scan(//) + end + ++ def test_scan_string ++ s = StringScanner.new('stra strb strc') ++ assert_equal 'str', s.scan('str') ++ assert_equal 'str', s[0] ++ assert_equal 3, s.pos ++ assert_equal false, s.tainted? ++ assert_equal 'a ', s.scan('a ') ++ ++ str = 'stra strb strc'.dup ++ str.taint ++ s = StringScanner.new(str, false) ++ matched = s.scan('str') ++ assert_equal 'str', matched ++ assert_equal true, matched.tainted? ++ end ++ + def test_skip + s = StringScanner.new('stra strb strc', true) + assert_equal 4, s.skip(/\w+/) +@@ -367,8 +383,10 @@ def test_matched + assert_equal false, s.matched.tainted? + s.scan(/\s+/) + assert_equal ' ', s.matched ++ s.scan('st') ++ assert_equal 'st', s.matched + s.scan(/\w+/) +- assert_equal 'strb', s.matched ++ assert_equal 'rb', s.matched + s.scan(/\s+/) + assert_equal ' ', s.matched + s.scan(/\w+/) +@@ -483,7 +501,7 @@ def test_pre_match + s.skip(/\s/) + assert_equal 'a', s.pre_match + assert_equal false, s.pre_match.tainted? +- s.scan(/\w/) ++ s.scan('b') + assert_equal 'a ', s.pre_match + s.scan_until(/c/) + assert_equal 'a b ', s.pre_match +@@ -513,7 +531,7 @@ def test_post_match + assert_equal ' b c d e', s.post_match + s.skip(/\s/) + assert_equal 'b c d e', s.post_match +- s.scan(/\w/) ++ s.scan('b') + assert_equal ' c d e', s.post_match + s.scan_until(/c/) + assert_equal ' d e', s.post_match +@@ -589,6 +607,20 @@ def test_encoding + assert_equal(Encoding::EUC_JP, ss.scan(/./e).encoding) + end + ++ def test_encoding_string ++ str = "\xA1\xA2".dup.force_encoding("euc-jp") ++ ss = StringScanner.new(str) ++ assert_equal(str.dup, ss.scan(str.dup)) ++ end ++ ++ def test_invalid_encoding_string ++ str = "\xA1\xA2".dup.force_encoding("euc-jp") ++ ss = StringScanner.new(str) ++ assert_raise(Encoding::CompatibilityError) do ++ ss.scan(str.encode("UTF-8")) ++ end ++ end ++ + def test_generic_regexp + ss = StringScanner.new("\xA1\xA2".dup.force_encoding("euc-jp")) + t = ss.scan(/./) +@@ -643,6 +675,13 @@ def test_exist_p + assert_equal(nil, s.exist?(/e/)) + end + ++ def test_exist_p_string ++ s = StringScanner.new("test string") ++ assert_raise(TypeError) do ++ s.exist?(" ") ++ end ++ end ++ + def test_skip_until + s = StringScanner.new("Foo Bar Baz") + assert_equal(3, s.skip_until(/Foo/))