From 694239f0855668c986feba6f1b395ecd94a1f0bc Mon Sep 17 00:00:00 2001 From: Kouhei Sutou Date: Fri, 28 Sep 2018 16:29:17 +0900 Subject: [PATCH 1/7] Make PI parsing more robust PI target must be "NAME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))". So '' is invalid because "=" isn't included in NAME. Without this change, 'pos="3"' was accepted as PI target. === Commit implements #process_instruction method for baseparser that later patches rely upon. https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc --- lib/rexml/parsers/baseparser.rb | 18 ++++++++----- test/rexml/data/t75.xml | 2 +- .../parse/test_processing_instruction.rb | 25 +++++++++++++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 test/rexml/parse/test_processing_instruction.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index e7ef695912..84a0bcdcec 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -61,7 +61,7 @@ class BaseParser XMLDECL_START = /\A<\?xml\s/u; XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um + INSTRUCTION_PATTERN = /<\?#{QNAME}(\s+.*?)?\?>/um TAG_MATCH = /\A<((?>#{QNAME_STR}))/um CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um @@ -229,7 +229,7 @@ def pull_event standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] when INSTRUCTION_START - return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + return process_instruction when DOCTYPE_START base_error_message = "Malformed DOCTYPE" @source.match(DOCTYPE_START, true) @@ -395,10 +395,7 @@ def pull_event raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) elsif @source.buffer[1] == ?? - md = @source.match( INSTRUCTION_PATTERN, true ) - return [ :processing_instruction, md[1], md[2] ] if md - raise REXML::ParseException.new( "Bad instruction declaration", - @source) + return process_instruction else # Get the next tag md = @source.match(TAG_MATCH, true) @@ -675,6 +672,15 @@ def parse_attributes(prefixes, curr_ns) end return attributes, closed end + + def process_instruction + match_data = @source.match(INSTRUCTION_PATTERN, true) + unless match_data + message = "Invalid processing instruction node" + raise REXML::ParseException.new(message, @source) + end + [:processing_instruction, match_data[1], match_data[4]] + end end end end diff --git a/test/rexml/data/t75.xml b/test/rexml/data/t75.xml index 0911fb1b1a..eb3cccee4b 100644 --- a/test/rexml/data/t75.xml +++ b/test/rexml/data/t75.xml @@ -1,4 +1,4 @@ - + ") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Invalid processing instruction node +Line: 1 +Position: 4 +Last 80 unconsumed characters: + + DETAIL + end + end + end +end From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 17 Jan 2024 15:32:57 +0900 Subject: [PATCH 2/7] Use string scanner with baseparser (#105) Using StringScanner reduces the string copying process and speeds up the process. And I removed unnecessary methods. https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105 ``` ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux] Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s Comparison: dom master(YJIT): 8.3 i/s 3.2.6(YJIT): 8.1 i/s - 1.02x slower master: 5.1 i/s - 1.64x slower rexml 3.2.6: 4.9 i/s - 1.71x slower sax master(YJIT): 20.9 i/s 3.2.6(YJIT): 19.2 i/s - 1.09x slower master: 14.0 i/s - 1.50x slower rexml 3.2.6: 13.6 i/s - 1.54x slower pull master(YJIT): 25.4 i/s 3.2.6(YJIT): 22.3 i/s - 1.14x slower master: 16.9 i/s - 1.50x slower rexml 3.2.6: 15.6 i/s - 1.62x slower stream master(YJIT): 22.2 i/s 3.2.6(YJIT): 19.8 i/s - 1.12x slower master: 15.8 i/s - 1.40x slower rexml 3.2.6: 14.3 i/s - 1.55x slower ``` - YJIT=ON : 1.02x - 1.14x faster - YJIT=OFF : 1.02x - 1.10x faster --------- Co-authored-by: Sutou Kouhei === The PR focuses on enhancements in lib/rexml/source.rb. StringScanner is used instead of just a string for scanning and matching. This behavior is relied upon in later patches. Preparation for fix for CVE-2024-35176. https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c --- lib/rexml/parsers/baseparser.rb | 21 ++- lib/rexml/source.rb | 149 ++++++-------------- test/rexml/parse/test_entity_declaration.rb | 36 +++++ test/rexml/test_core.rb | 2 +- 4 files changed, 93 insertions(+), 115 deletions(-) create mode 100644 test/rexml/parse/test_entity_declaration.rb diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 84a0bcdcec..c29783af5c 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -98,7 +98,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s*0 - rv - end - def read end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) - end - - def match_to( char, pattern ) - return pattern.match(@buffer) - end - - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md - end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? nil : @scanner + end end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil - end - end - rv = super - end - rv.taint - rv - end - def read begin - @buffer << readline + # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, + # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. + # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed + # and avoids this problem. + @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil end end - def consume( pattern ) - match( pattern, true ) - end - def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end + while md.nil? and @source begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv + @scanner << readline + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) + end rescue @source = nil end end - rv.taint - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -289,7 +234,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/test/rexml/parse/test_entity_declaration.rb b/test/rexml/parse/test_entity_declaration.rb new file mode 100644 index 0000000000..e15deec60d --- /dev/null +++ b/test/rexml/parse/test_entity_declaration.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: false +require 'test/unit' +require 'rexml/document' + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed notation declaration: name is missing +Line: 5 +Position: 72 +Last 80 unconsumed characters: + ]> + DETAIL + end + end +end diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb index ee5438d5e5..d4ece491b9 100644 --- a/test/rexml/test_core.rb +++ b/test/rexml/test_core.rb @@ -681,7 +681,7 @@ def test_iso_8859_1_output_function koln_iso_8859_1 = "K\xF6ln" koln_utf8 = "K\xc3\xb6ln" source = Source.new( koln_iso_8859_1, 'iso-8859-1' ) - results = source.scan(/.*/)[0] + results = source.match(/.*/)[0] koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding) assert_equal koln_utf8, results output << results From 77128555476cb0db798e2912fb3a07d6411dc320 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 21 Jan 2024 20:02:00 +0900 Subject: [PATCH 3/7] Use `@scanner << readline` instead of `@scanner.string = @scanner.rest + readline` (#107) ## Why JRuby's `StringScanner#<<` and `StringScanner#scan` OutOfMemoryError has been resolved in strscan gem 3.0.9. https://github.com/ruby/strscan/issues/83 ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 10.958 11.044 16.615 16.783 i/s - 100.000 times in 9.126104s 9.055023s 6.018799s 5.958437s sax 29.624 29.609 44.390 45.370 i/s - 100.000 times in 3.375641s 3.377372s 2.252774s 2.204080s pull 33.868 34.695 51.173 53.492 i/s - 100.000 times in 2.952679s 2.882229s 1.954138s 1.869422s stream 31.719 32.351 43.604 45.403 i/s - 100.000 times in 3.152713s 3.091052s 2.293356s 2.202514s Comparison: dom after(YJIT): 16.8 i/s before(YJIT): 16.6 i/s - 1.01x slower after: 11.0 i/s - 1.52x slower before: 11.0 i/s - 1.53x slower sax after(YJIT): 45.4 i/s before(YJIT): 44.4 i/s - 1.02x slower before: 29.6 i/s - 1.53x slower after: 29.6 i/s - 1.53x slower pull after(YJIT): 53.5 i/s before(YJIT): 51.2 i/s - 1.05x slower after: 34.7 i/s - 1.54x slower before: 33.9 i/s - 1.58x slower stream after(YJIT): 45.4 i/s before(YJIT): 43.6 i/s - 1.04x slower after: 32.4 i/s - 1.40x slower before: 31.7 i/s - 1.43x slower ``` - YJIT=ON : 1.01x - 1.05x faster - YJIT=OFF : 1.00x - 1.02x faster === Backported, as the PR #114 and mainly PR #126 count with the method using `<<` instead of setting the scanner's string directly. https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320 --- lib/rexml/source.rb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 9883e79210..4624777fa0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -149,11 +149,7 @@ def initialize(arg, block_size=500, encoding=nil) def read begin - # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM, - # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs. - # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed - # and avoids this problem. - @scanner.string = @scanner.rest + readline + @scanner << readline rescue Exception, NameError @source = nil end From 370666e314816b57ecd5878e757224c3b6bc93f5 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Tue, 27 Feb 2024 09:48:35 +0900 Subject: [PATCH 4/7] Use more StringScanner based API to parse XML (#114) ## Why? Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan. ## Changed - Change `REXML::Parsers::BaseParser` from `frozen_string_literal: false` to `frozen_string_literal: true`. - Added `Source#string=` method for error message output. - Added TestParseDocumentTypeDeclaration#test_no_name test case. - Of the `intSubset` of DOCTYPE, " === This commit introduces wider use of StringScanner in lib/rexml/parsers/baseparse.rb. Which is relied upon in later patches. MISSING_ATTRIBUTE_QUOTES is not present in upstream lib/rexml/parsers/baseparser.rb at this point. While it doesn't seem to be used, this patch does not remove the constant so let's keep it around. https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5 --- lib/rexml/parsers/baseparser.rb | 328 ++++++++++-------- lib/rexml/source.rb | 31 +- .../parse/test_document_type_declaration.rb | 15 + 3 files changed, 205 insertions(+), 169 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c29783af5c..dafc1b1449 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require "strscan" @@ -121,6 +121,19 @@ class BaseParser ###################################################################### MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um + module Private + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + TAG_PATTERN = /((?>#{QNAME_STR}))/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /\s*#{NAME}/um + GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + end + private_constant :Private + include Private + def initialize( source ) self.stream = source @listeners = [] @@ -206,180 +219,172 @@ def pull_event #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START + if @source.match("/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + elsif @source.match("/um, true)[1] ] + elsif @source.match("DOCTYPE", true) + base_error_message = "Malformed DOCTYPE" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.string = "/um, true) + @nsstack.unshift(curr_ns=Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end - end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ - else - @document_status = :after_doctype - if @source.encoding == "UTF-8" - @source.buffer_encoding = ::Encoding::UTF_8 + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START - match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact] - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - match.delete_at(5) if match.size > 5 # Chop out NDATA decl - # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] - - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 - end + @source.match(/\s*/um, true) # skip spaces + if @source.match("/um, true) + raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? + return [ :elementdecl, "/um) - message = "#{base_error_message}: name is missing" + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else - message = "#{base_error_message}: invalid declaration name" + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] end - raise REXML::ParseException.new(message, @source) - end - name = parse_name(base_error_message) - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: true) - unless @source.match(/\A\s*>/um, true) - message = "#{base_error_message}: garbage before end >" - raise REXML::ParseException.new(message, @source) + match << '%' if ref + return match + elsif @source.match("ATTLIST", true) + md = @source.match(ATTLISTDECL_END, true) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + elsif @source.match("NOTATION", true) + base_error_message = "Malformed notation declaration" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.string = " /um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + elsif md = @source.match(/--(.*?)-->/um, true) + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md end - return [:notationdecl, name, *id] - when DOCTYPE_END + elsif match = @source.match(/(%.*?;)\s*/um, true) + return [ :externalentity, match[1] ] + elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin - @source.read if @source.buffer.size<2 - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ + if @source.match("<", true) + if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - #md = @source.match_to_consume( '>', CLOSE_MATCH) - md = @source.match( CLOSE_MATCH, true ) + md = @source.match(CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) end if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" - message << " (got '#{md[1]}')" if md + message += " (got '#{md[1]}')" if md + @source.string = "]*>)/um) + elsif @source.match("!", true) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match(/--(.*?)-->/um, true) case md[1] when /--/, /-\z/ @@ -388,19 +393,18 @@ def pull_event return [ :comment, md[1] ] if md else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? + elsif @source.match("?", true) return process_instruction else # Get the next tag - md = @source.match(TAG_MATCH, true) + md = @source.match(TAG_PATTERN, true) unless md - # Check for missing attribute quotes - raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) + @source.string = "<" + @source.buffer raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @@ -425,7 +429,7 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) + md = @source.match(/([^<]*)/um, true) text = md[1] if md[0].length == 0 @source.match( /(\s+)/, true ) @@ -472,8 +476,7 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) + rv = string.gsub( /\r\n?/, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { @@ -508,9 +511,9 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(/\A\s*#{NAME}/um, true) + md = @source.match(NAME_PATTERN, true) unless md - if @source.match(/\A\s*\S/um) + if @source.match(/\s*\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" @@ -671,12 +674,29 @@ def parse_attributes(prefixes, curr_ns) end def process_instruction - match_data = @source.match(INSTRUCTION_PATTERN, true) + match_data = @source.match(INSTRUCTION_END, true) unless match_data message = "Invalid processing instruction node" + @source.string = " DETAIL end + + def test_no_name + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: name is missing +Line: 3 +Position: 17 +Last 80 unconsumed characters: + + DETAIL + end end class TestExternalID < self From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Mon, 18 Mar 2024 23:30:47 +0900 Subject: [PATCH 5/7] Optimize the parse_attributes method to use `Source#match` to parse XML. (#119) ## Why? Improve maintainability by consolidating processing into `Source#match`. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 10.891 10.622 16.356 17.403 i/s - 100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s sax 30.335 29.845 49.749 54.877 i/s - 100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s pull 35.514 34.801 61.123 66.908 i/s - 100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s stream 35.141 34.475 52.110 56.836 i/s - 100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s Comparison: dom after(YJIT): 17.4 i/s before(YJIT): 16.4 i/s - 1.06x slower before: 10.9 i/s - 1.60x slower after: 10.6 i/s - 1.64x slower sax after(YJIT): 54.9 i/s before(YJIT): 49.7 i/s - 1.10x slower before: 30.3 i/s - 1.81x slower after: 29.8 i/s - 1.84x slower pull after(YJIT): 66.9 i/s before(YJIT): 61.1 i/s - 1.09x slower before: 35.5 i/s - 1.88x slower after: 34.8 i/s - 1.92x slower stream after(YJIT): 56.8 i/s before(YJIT): 52.1 i/s - 1.09x slower before: 35.1 i/s - 1.62x slower after: 34.5 i/s - 1.65x slower ``` - YJIT=ON : 1.06x - 1.10x faster - YJIT=OFF : 0.97x - 0.98x faster === "test_attribute_namespace_conflict" in test/rexml/test_core.rb adjustment not applied as the commit that added the test is not present. Wider use of Source#match to parse XML. Changes the way BaseParser#parse_attribute grabs fields that make up an XML attribute. This is relied upon by upstream 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb that actually fixes CVE-2024-35176. https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d --- lib/rexml/parsers/baseparser.rb | 113 ++++++++++++------------------- test/rexml/parse/test_element.rb | 2 +- test/rexml/test_core.rb | 17 ++++- 3 files changed, 61 insertions(+), 71 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index dafc1b1449..45eabe8cc5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -123,7 +123,7 @@ class BaseParser module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um - TAG_PATTERN = /((?>#{QNAME_STR}))/um + TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /\s*#{NAME}/um @@ -592,85 +592,60 @@ def parse_id_invalid_details(accept_external_id:, def parse_attributes(prefixes, curr_ns) attributes = {} closed = false - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data.nil? - message = "Start tag isn't ended" - raise REXML::ParseException.new(message, @source) - end - - raw_attributes = match_data[1] - closed = !match_data[2].nil? - return attributes, closed if raw_attributes.nil? - return attributes, closed if raw_attributes.empty? - - scanner = StringScanner.new(raw_attributes) - until scanner.eos? - if scanner.scan(/\s+/) - break if scanner.eos? - end - - pos = scanner.pos - loop do - break if scanner.scan(ATTRIBUTE_PATTERN) - unless scanner.scan(QNAME) - message = "Invalid attribute name: <#{scanner.rest}>" - raise REXML::ParseException.new(message, @source) - end - name = scanner[0] - unless scanner.scan(/\s*=\s*/um) + while true + if @source.match(">", true) + return attributes, closed + elsif @source.match("/>", true) + closed = true + return attributes, closed + elsif match = @source.match(QNAME, true) + name = match[1] + prefix = match[2] + local_part = match[3] + + unless @source.match(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = scanner.scan(/['"]/) - unless quote - message = "Missing attribute value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless scanner.scan(/.*#{Regexp.escape(quote)}/um) - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data - scanner << "/" if closed - scanner << ">" - scanner << match_data[1] - scanner.pos = pos - closed = !match_data[2].nil? - next + unless match = @source.match(/(['"])(.*?)\1\s*/um, true) + if match = @source.match(/(['"])/, true) + message = + "Missing attribute value end quote: <#{name}>: <#{match[1]}>" + raise REXML::ParseException.new(message, @source) + else + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) end - message = - "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) end - end - name = scanner[1] - prefix = scanner[2] - local_part = scanner[3] - # quote = scanner[4] - value = scanner[5] - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ + value = match[2] + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) + raise REXML::ParseException.new( msg, @source, self) end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - if attributes.has_key?(name) - msg = "Duplicate attribute #{name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end + if attributes.has_key?(name) + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end - attributes[name] = value + attributes[name] = value + else + message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" + raise REXML::ParseException.new(message, @source) + end end - return attributes, closed end def process_instruction diff --git a/test/rexml/parse/test_element.rb b/test/rexml/parse/test_element.rb index e8dce4b997..987214f3bb 100644 --- a/test/rexml/parse/test_element.rb +++ b/test/rexml/parse/test_element.rb @@ -43,7 +43,7 @@ def test_empty_namespace_attribute_name Line: 1 Position: 13 Last 80 unconsumed characters: - +:a=""> DETAIL end diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb index d4ece491b9..46de90ccd7 100644 --- a/test/rexml/test_core.rb +++ b/test/rexml/test_core.rb @@ -1277,11 +1277,26 @@ def test_ticket_21 exception = assert_raise(ParseException) do Document.new(src) end - assert_equal(<<-DETAIL, exception.to_s) + assert_equal(<<-DETAIL.chomp, exception.to_s) Missing attribute value start quote: Line: 1 Position: 16 Last 80 unconsumed characters: +value/> + DETAIL + end + + def test_parse_exception_on_missing_attribute_end_quote + src = ' +Line: 1 +Position: 17 +Last 80 unconsumed characters: +value/> DETAIL end From 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Thu, 16 May 2024 11:26:51 +0900 Subject: [PATCH 6/7] Read quoted attributes in chunks (#126) === test/rexml/test_document.rb: * test_gt_linear_performance Test not included, Ruby 2.5 does not have assert_linear_performance method available in the test suite. It was introduced in Ruby 2.7. * Do not require 'core_assertions' Current requires are fine for running the tests in mock build. The fix for CVE-2024-35176. By reading the field in chunks instead of 1 by 1 it improves performance for large strings composed of angle brackets. https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb --- lib/rexml/parsers/baseparser.rb | 20 ++++++++++---------- lib/rexml/source.rb | 29 ++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 45eabe8cc5..304835e407 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -607,17 +607,17 @@ def parse_attributes(prefixes, curr_ns) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless match = @source.match(/(['"])(.*?)\1\s*/um, true) - if match = @source.match(/(['"])/, true) - message = - "Missing attribute value end quote: <#{name}>: <#{match[1]}>" - raise REXML::ParseException.new(message, @source) - else - message = "Missing attribute value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end + unless match = @source.match(/(['"])/, true) + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + quote = match[1] + value = @source.read_until(quote) + unless value.chomp!(quote) + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) end - value = match[2] + @source.match(/\s*/um, true) if prefix == "xmlns" if local_part == "xml" if value != "http://www.w3.org/XML/1998/namespace" diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index d2a6ad872e..2a3bd36695 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -65,7 +65,11 @@ def encoding=(enc) encoding_updated end - def read + def read(term = nil) + end + + def read_until(term) + @scanner.scan_until(Regexp.union(term)) or @scanner.rest end def match(pattern, cons=false) @@ -151,9 +155,9 @@ def initialize(arg, block_size=500, encoding=nil) end end - def read + def read(term = nil) begin - @scanner << readline + @scanner << readline(term) true rescue Exception, NameError @source = nil @@ -161,6 +165,21 @@ def read end end + def read_until(term) + pattern = Regexp.union(term) + data = [] + begin + until str = @scanner.scan_until(pattern) + @scanner << readline(term) + end + rescue EOFError + @scanner.rest + else + read if @scanner.eos? and !@source.eof? + str + end + end + def match( pattern, cons=false ) read if @scanner.eos? && @source while true @@ -204,8 +223,8 @@ def current_line end private - def readline - str = @source.readline(@line_break) + def readline(term = nil) + str = @source.readline(term || @line_break) if @pending_buffer if str.nil? str = @pending_buffer From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 20 May 2024 12:17:27 +0900 Subject: [PATCH 7/7] Add support for old strscan Fix GH-132 If we support old strscan, users can also use strscan installed as a default gem. Reported by Adam. Thanks!!! === strscan shipped with Ruby 2.5 lacks a method StringScanner#captures. ruby/rexml#105 brought this dependency on the method. https://bugs.ruby-lang.org/issues/20516#note-11 https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420 --- lib/rexml/parsers/baseparser.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 304835e407..006d8acb9b 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -9,6 +9,17 @@ module REXML module Parsers + if StringScanner::Version < "3.0.8" + module StringScannerCaptures + refine StringScanner do + def captures + values_at(*(1...size)) + end + end + end + using StringScannerCaptures + end + # = Using the Pull Parser # This API is experimental, and subject to change. # parser = PullParser.new( "texttxet" )