1663 lines
61 KiB
Diff
1663 lines
61 KiB
Diff
From 694239f0855668c986feba6f1b395ecd94a1f0bc Mon Sep 17 00:00:00 2001
|
|
From: Kouhei Sutou <kou@clear-code.com>
|
|
Date: Fri, 28 Sep 2018 16:29:17 +0900
|
|
Subject: [PATCH 1/7] Make PI parsing more robust
|
|
|
|
PI target must be "NAME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))". So
|
|
'<?pos="3"?>' is invalid because "=" isn't included in NAME.
|
|
|
|
Without this change, 'pos="3"' was accepted as PI target.
|
|
|
|
===
|
|
This commit implements the #process_instruction method for baseparser
|
|
that later patches rely upon.
|
|
|
|
https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 18 ++++++++-----
|
|
test/rexml/data/t75.xml | 2 +-
|
|
.../parse/test_processing_instruction.rb | 25 +++++++++++++++++++
|
|
3 files changed, 38 insertions(+), 7 deletions(-)
|
|
create mode 100644 test/rexml/parse/test_processing_instruction.rb
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index e7ef695912..84a0bcdcec 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -61,7 +61,7 @@ class BaseParser
|
|
XMLDECL_START = /\A<\?xml\s/u;
|
|
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
|
|
INSTRUCTION_START = /\A<\?/u
|
|
- INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
|
|
+ INSTRUCTION_PATTERN = /<\?#{QNAME}(\s+.*?)?\?>/um
|
|
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
|
|
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
|
|
|
|
@@ -229,7 +229,7 @@ def pull_event
|
|
standalone = standalone[1] unless standalone.nil?
|
|
return [ :xmldecl, version, encoding, standalone ]
|
|
when INSTRUCTION_START
|
|
- return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
|
|
+ return process_instruction
|
|
when DOCTYPE_START
|
|
base_error_message = "Malformed DOCTYPE"
|
|
@source.match(DOCTYPE_START, true)
|
|
@@ -395,10 +395,7 @@ def pull_event
|
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
|
"in the doctype declaration.", @source)
|
|
elsif @source.buffer[1] == ??
|
|
- md = @source.match( INSTRUCTION_PATTERN, true )
|
|
- return [ :processing_instruction, md[1], md[2] ] if md
|
|
- raise REXML::ParseException.new( "Bad instruction declaration",
|
|
- @source)
|
|
+ return process_instruction
|
|
else
|
|
# Get the next tag
|
|
md = @source.match(TAG_MATCH, true)
|
|
@@ -675,6 +672,15 @@ def parse_attributes(prefixes, curr_ns)
|
|
end
|
|
return attributes, closed
|
|
end
|
|
+
|
|
+ def process_instruction
|
|
+ match_data = @source.match(INSTRUCTION_PATTERN, true)
|
|
+ unless match_data
|
|
+ message = "Invalid processing instruction node"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
+ [:processing_instruction, match_data[1], match_data[4]]
|
|
+ end
|
|
end
|
|
end
|
|
end
|
|
diff --git a/test/rexml/data/t75.xml b/test/rexml/data/t75.xml
|
|
index 0911fb1b1a..eb3cccee4b 100644
|
|
--- a/test/rexml/data/t75.xml
|
|
+++ b/test/rexml/data/t75.xml
|
|
@@ -1,4 +1,4 @@
|
|
-<?xml version="1.0" encoding="ISO-8859-1"?><?pos="3"?>
|
|
+<?xml version="1.0" encoding="ISO-8859-1"?>
|
|
<!-- generated by hnb 1.9.17 (http://hnb.sourceforge.net) -->
|
|
|
|
<!DOCTYPE tree[
|
|
diff --git a/test/rexml/parse/test_processing_instruction.rb b/test/rexml/parse/test_processing_instruction.rb
|
|
new file mode 100644
|
|
index 0000000000..a23513fc6e
|
|
--- /dev/null
|
|
+++ b/test/rexml/parse/test_processing_instruction.rb
|
|
@@ -0,0 +1,25 @@
|
|
+require "test/unit"
|
|
+require "rexml/document"
|
|
+
|
|
+module REXMLTests
|
|
+ class TestParseProcessinInstruction < Test::Unit::TestCase
|
|
+ def parse(xml)
|
|
+ REXML::Document.new(xml)
|
|
+ end
|
|
+
|
|
+ class TestInvalid < self
|
|
+ def test_no_name
|
|
+ exception = assert_raise(REXML::ParseException) do
|
|
+ parse("<??>")
|
|
+ end
|
|
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
|
|
+Invalid processing instruction node
|
|
+Line: 1
|
|
+Position: 4
|
|
+Last 80 unconsumed characters:
|
|
+<??>
|
|
+ DETAIL
|
|
+ end
|
|
+ end
|
|
+ end
|
|
+end
|
|
|
|
From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001
|
|
From: NAITOH Jun <naitoh@gmail.com>
|
|
Date: Wed, 17 Jan 2024 15:32:57 +0900
|
|
Subject: [PATCH 2/7] Use string scanner with baseparser (#105)
|
|
|
|
Using StringScanner reduces the string copying process and speeds up the
|
|
process.
|
|
|
|
And I removed unnecessary methods.
|
|
|
|
https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105
|
|
|
|
```
|
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
|
|
Calculating -------------------------------------
|
|
rexml 3.2.6 master 3.2.6(YJIT) master(YJIT)
|
|
dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
|
|
sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
|
|
pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
|
|
stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s
|
|
|
|
Comparison:
|
|
dom
|
|
master(YJIT): 8.3 i/s
|
|
3.2.6(YJIT): 8.1 i/s - 1.02x slower
|
|
master: 5.1 i/s - 1.64x slower
|
|
rexml 3.2.6: 4.9 i/s - 1.71x slower
|
|
|
|
sax
|
|
master(YJIT): 20.9 i/s
|
|
3.2.6(YJIT): 19.2 i/s - 1.09x slower
|
|
master: 14.0 i/s - 1.50x slower
|
|
rexml 3.2.6: 13.6 i/s - 1.54x slower
|
|
|
|
pull
|
|
master(YJIT): 25.4 i/s
|
|
3.2.6(YJIT): 22.3 i/s - 1.14x slower
|
|
master: 16.9 i/s - 1.50x slower
|
|
rexml 3.2.6: 15.6 i/s - 1.62x slower
|
|
|
|
stream
|
|
master(YJIT): 22.2 i/s
|
|
3.2.6(YJIT): 19.8 i/s - 1.12x slower
|
|
master: 15.8 i/s - 1.40x slower
|
|
rexml 3.2.6: 14.3 i/s - 1.55x slower
|
|
```
|
|
|
|
- YJIT=ON : 1.02x - 1.14x faster
|
|
- YJIT=OFF : 1.02x - 1.10x faster
|
|
|
|
---------
|
|
|
|
Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
|
|
|
|
===
|
|
The PR focuses on enhancements in lib/rexml/source.rb.
|
|
StringScanner is used instead of just a string for scanning and
|
|
matching.
|
|
This behavior is relied upon in later patches.
|
|
|
|
Preparation for fix for CVE-2024-35176.
|
|
|
|
https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 21 ++-
|
|
lib/rexml/source.rb | 149 ++++++--------------
|
|
test/rexml/parse/test_entity_declaration.rb | 36 +++++
|
|
test/rexml/test_core.rb | 2 +-
|
|
4 files changed, 93 insertions(+), 115 deletions(-)
|
|
create mode 100644 test/rexml/parse/test_entity_declaration.rb
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index 84a0bcdcec..c29783af5c 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -98,7 +98,7 @@ class BaseParser
|
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
|
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
|
|
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
|
|
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
|
@@ -268,7 +268,7 @@ def pull_event
|
|
else
|
|
@document_status = :after_doctype
|
|
if @source.encoding == "UTF-8"
|
|
- @source.buffer.force_encoding(::Encoding::UTF_8)
|
|
+ @source.buffer_encoding = ::Encoding::UTF_8
|
|
end
|
|
end
|
|
end
|
|
@@ -283,8 +283,7 @@ def pull_event
|
|
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
|
|
|
|
when ENTITY_START
|
|
- match = @source.match( ENTITYDECL, true ).to_a.compact
|
|
- match[0] = :entitydecl
|
|
+ match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
|
|
ref = false
|
|
if match[1] == '%'
|
|
ref = true
|
|
@@ -404,6 +403,7 @@ def pull_event
|
|
raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
|
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
|
end
|
|
+ tag = md[1]
|
|
@document_status = :in_element
|
|
prefixes = Set.new
|
|
prefixes << md[2] if md[2]
|
|
@@ -417,23 +417,20 @@ def pull_event
|
|
end
|
|
|
|
if closed
|
|
- @closed = md[1]
|
|
+ @closed = tag
|
|
@nsstack.shift
|
|
else
|
|
- @tags.push( md[1] )
|
|
+ @tags.push( tag )
|
|
end
|
|
- return [ :start_element, md[1], attributes ]
|
|
+ return [ :start_element, tag, attributes ]
|
|
end
|
|
else
|
|
md = @source.match( TEXT_PATTERN, true )
|
|
+ text = md[1]
|
|
if md[0].length == 0
|
|
@source.match( /(\s+)/, true )
|
|
end
|
|
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
|
|
- #return [ :text, "" ] if md[0].length == 0
|
|
- # unnormalized = Text::unnormalize( md[1], self )
|
|
- # return PullEvent.new( :text, md[1], unnormalized )
|
|
- return [ :text, md[1] ]
|
|
+ return [ :text, text ]
|
|
end
|
|
rescue REXML::UndefinedNamespaceException
|
|
raise
|
|
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
|
|
index af65cf4751..9883e79210 100644
|
|
--- a/lib/rexml/source.rb
|
|
+++ b/lib/rexml/source.rb
|
|
@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
|
|
# objects and provides consumption of text
|
|
class Source
|
|
include Encoding
|
|
- # The current buffer (what we're going to read next)
|
|
- attr_reader :buffer
|
|
# The line number of the last consumed text
|
|
attr_reader :line
|
|
attr_reader :encoding
|
|
@@ -41,7 +39,8 @@ class Source
|
|
# @param encoding if non-null, sets the encoding of the source to this
|
|
# value, overriding all encoding detection
|
|
def initialize(arg, encoding=nil)
|
|
- @orig = @buffer = arg
|
|
+ @orig = arg
|
|
+ @scanner = StringScanner.new(@orig)
|
|
if encoding
|
|
self.encoding = encoding
|
|
else
|
|
@@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
|
|
@line = 0
|
|
end
|
|
|
|
+ # The current buffer (what we're going to read next)
|
|
+ def buffer
|
|
+ @scanner.rest
|
|
+ end
|
|
+
|
|
+ def buffer_encoding=(encoding)
|
|
+ @scanner.string.force_encoding(encoding)
|
|
+ end
|
|
|
|
# Inherited from Encoding
|
|
# Overridden to support optimized en/decoding
|
|
@@ -58,98 +65,57 @@ def encoding=(enc)
|
|
encoding_updated
|
|
end
|
|
|
|
- # Scans the source for a given pattern. Note, that this is not your
|
|
- # usual scan() method. For one thing, the pattern argument has some
|
|
- # requirements; for another, the source can be consumed. You can easily
|
|
- # confuse this method. Originally, the patterns were easier
|
|
- # to construct and this method more robust, because this method
|
|
- # generated search regexps on the fly; however, this was
|
|
- # computationally expensive and slowed down the entire REXML package
|
|
- # considerably, since this is by far the most commonly called method.
|
|
- # @param pattern must be a Regexp, and must be in the form of
|
|
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
|
|
- # will be returned; the second group is used if the consume flag is
|
|
- # set.
|
|
- # @param consume if true, the pattern returned will be consumed, leaving
|
|
- # everything after it in the Source.
|
|
- # @return the pattern, if found, or nil if the Source is empty or the
|
|
- # pattern is not found.
|
|
- def scan(pattern, cons=false)
|
|
- return nil if @buffer.nil?
|
|
- rv = @buffer.scan(pattern)
|
|
- @buffer = $' if cons and rv.size>0
|
|
- rv
|
|
- end
|
|
-
|
|
def read
|
|
end
|
|
|
|
- def consume( pattern )
|
|
- @buffer = $' if pattern.match( @buffer )
|
|
- end
|
|
-
|
|
- def match_to( char, pattern )
|
|
- return pattern.match(@buffer)
|
|
- end
|
|
-
|
|
- def match_to_consume( char, pattern )
|
|
- md = pattern.match(@buffer)
|
|
- @buffer = $'
|
|
- return md
|
|
- end
|
|
-
|
|
def match(pattern, cons=false)
|
|
- md = pattern.match(@buffer)
|
|
- @buffer = $' if cons and md
|
|
- return md
|
|
+ if cons
|
|
+ @scanner.scan(pattern).nil? ? nil : @scanner
|
|
+ else
|
|
+ @scanner.check(pattern).nil? ? nil : @scanner
|
|
+ end
|
|
end
|
|
|
|
# @return true if the Source is exhausted
|
|
def empty?
|
|
- @buffer == ""
|
|
- end
|
|
-
|
|
- def position
|
|
- @orig.index( @buffer )
|
|
+ @scanner.eos?
|
|
end
|
|
|
|
# @return the current line in the source
|
|
def current_line
|
|
lines = @orig.split
|
|
- res = lines.grep @buffer[0..30]
|
|
+ res = lines.grep @scanner.rest[0..30]
|
|
res = res[-1] if res.kind_of? Array
|
|
lines.index( res ) if res
|
|
end
|
|
|
|
private
|
|
+
|
|
def detect_encoding
|
|
- buffer_encoding = @buffer.encoding
|
|
+ scanner_encoding = @scanner.rest.encoding
|
|
detected_encoding = "UTF-8"
|
|
begin
|
|
- @buffer.force_encoding("ASCII-8BIT")
|
|
- if @buffer[0, 2] == "\xfe\xff"
|
|
- @buffer[0, 2] = ""
|
|
+ @scanner.string.force_encoding("ASCII-8BIT")
|
|
+ if @scanner.scan(/\xfe\xff/n)
|
|
detected_encoding = "UTF-16BE"
|
|
- elsif @buffer[0, 2] == "\xff\xfe"
|
|
- @buffer[0, 2] = ""
|
|
+ elsif @scanner.scan(/\xff\xfe/n)
|
|
detected_encoding = "UTF-16LE"
|
|
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
|
|
- @buffer[0, 3] = ""
|
|
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
|
|
detected_encoding = "UTF-8"
|
|
end
|
|
ensure
|
|
- @buffer.force_encoding(buffer_encoding)
|
|
+ @scanner.string.force_encoding(scanner_encoding)
|
|
end
|
|
self.encoding = detected_encoding
|
|
end
|
|
|
|
def encoding_updated
|
|
if @encoding != 'UTF-8'
|
|
- @buffer = decode(@buffer)
|
|
+ @scanner.string = decode(@scanner.rest)
|
|
@to_utf = true
|
|
else
|
|
@to_utf = false
|
|
- @buffer.force_encoding ::Encoding::UTF_8
|
|
+ @scanner.string.force_encoding(::Encoding::UTF_8)
|
|
end
|
|
end
|
|
end
|
|
@@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil)
|
|
end
|
|
|
|
if !@to_utf and
|
|
- @buffer.respond_to?(:force_encoding) and
|
|
+ @orig.respond_to?(:force_encoding) and
|
|
@source.respond_to?(:external_encoding) and
|
|
@source.external_encoding != ::Encoding::UTF_8
|
|
@force_utf8 = true
|
|
@@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil)
|
|
end
|
|
end
|
|
|
|
- def scan(pattern, cons=false)
|
|
- rv = super
|
|
- # You'll notice that this next section is very similar to the same
|
|
- # section in match(), but just a liiittle different. This is
|
|
- # because it is a touch faster to do it this way with scan()
|
|
- # than the way match() does it; enough faster to warrant duplicating
|
|
- # some code
|
|
- if rv.size == 0
|
|
- until @buffer =~ pattern or @source.nil?
|
|
- begin
|
|
- @buffer << readline
|
|
- rescue Iconv::IllegalSequence
|
|
- raise
|
|
- rescue
|
|
- @source = nil
|
|
- end
|
|
- end
|
|
- rv = super
|
|
- end
|
|
- rv.taint
|
|
- rv
|
|
- end
|
|
-
|
|
def read
|
|
begin
|
|
- @buffer << readline
|
|
+ # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
|
|
+ # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
|
|
+ # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
|
|
+ # and avoids this problem.
|
|
+ @scanner.string = @scanner.rest + readline
|
|
rescue Exception, NameError
|
|
@source = nil
|
|
end
|
|
end
|
|
|
|
- def consume( pattern )
|
|
- match( pattern, true )
|
|
- end
|
|
-
|
|
def match( pattern, cons=false )
|
|
- rv = pattern.match(@buffer)
|
|
- @buffer = $' if cons and rv
|
|
- while !rv and @source
|
|
+ if cons
|
|
+ md = @scanner.scan(pattern)
|
|
+ else
|
|
+ md = @scanner.check(pattern)
|
|
+ end
|
|
+ while md.nil? and @source
|
|
begin
|
|
- @buffer << readline
|
|
- rv = pattern.match(@buffer)
|
|
- @buffer = $' if cons and rv
|
|
+ @scanner << readline
|
|
+ if cons
|
|
+ md = @scanner.scan(pattern)
|
|
+ else
|
|
+ md = @scanner.check(pattern)
|
|
+ end
|
|
rescue
|
|
@source = nil
|
|
end
|
|
end
|
|
- rv.taint
|
|
- rv
|
|
+
|
|
+ md.nil? ? nil : @scanner
|
|
end
|
|
|
|
def empty?
|
|
super and ( @source.nil? || @source.eof? )
|
|
end
|
|
|
|
- def position
|
|
- @er_source.pos rescue 0
|
|
- end
|
|
-
|
|
# @return the current line in the source
|
|
def current_line
|
|
begin
|
|
@@ -289,7 +234,7 @@ def encoding_updated
|
|
@source.set_encoding(@encoding, @encoding)
|
|
end
|
|
@line_break = encode(">")
|
|
- @pending_buffer, @buffer = @buffer, ""
|
|
+ @pending_buffer, @scanner.string = @scanner.rest, ""
|
|
@pending_buffer.force_encoding(@encoding)
|
|
super
|
|
end
|
|
diff --git a/test/rexml/parse/test_entity_declaration.rb b/test/rexml/parse/test_entity_declaration.rb
|
|
new file mode 100644
|
|
index 0000000000..e15deec60d
|
|
--- /dev/null
|
|
+++ b/test/rexml/parse/test_entity_declaration.rb
|
|
@@ -0,0 +1,36 @@
|
|
+# frozen_string_literal: false
|
|
+require 'test/unit'
|
|
+require 'rexml/document'
|
|
+
|
|
+module REXMLTests
|
|
+ class TestParseEntityDeclaration < Test::Unit::TestCase
|
|
+ private
|
|
+ def xml(internal_subset)
|
|
+ <<-XML
|
|
+<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
|
|
+#{internal_subset}
|
|
+]>
|
|
+<r/>
|
|
+ XML
|
|
+ end
|
|
+
|
|
+ def parse(internal_subset)
|
|
+ REXML::Document.new(xml(internal_subset)).doctype
|
|
+ end
|
|
+
|
|
+ def test_empty
|
|
+ exception = assert_raise(REXML::ParseException) do
|
|
+ parse(<<-INTERNAL_SUBSET)
|
|
+<!ENTITY>
|
|
+ INTERNAL_SUBSET
|
|
+ end
|
|
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
|
|
+Malformed notation declaration: name is missing
|
|
+Line: 5
|
|
+Position: 72
|
|
+Last 80 unconsumed characters:
|
|
+ <!ENTITY> ]> <r/>
|
|
+ DETAIL
|
|
+ end
|
|
+ end
|
|
+end
|
|
diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
|
|
index ee5438d5e5..d4ece491b9 100644
|
|
--- a/test/rexml/test_core.rb
|
|
+++ b/test/rexml/test_core.rb
|
|
@@ -681,7 +681,7 @@ def test_iso_8859_1_output_function
|
|
koln_iso_8859_1 = "K\xF6ln"
|
|
koln_utf8 = "K\xc3\xb6ln"
|
|
source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
|
|
- results = source.scan(/.*/)[0]
|
|
+ results = source.match(/.*/)[0]
|
|
koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
|
|
assert_equal koln_utf8, results
|
|
output << results
|
|
|
|
From 77128555476cb0db798e2912fb3a07d6411dc320 Mon Sep 17 00:00:00 2001
|
|
From: NAITOH Jun <naitoh@gmail.com>
|
|
Date: Sun, 21 Jan 2024 20:02:00 +0900
|
|
Subject: [PATCH 3/7] Use `@scanner << readline` instead of `@scanner.string =
|
|
@scanner.rest + readline` (#107)
|
|
|
|
## Why
|
|
|
|
JRuby's `StringScanner#<<` and `StringScanner#scan` OutOfMemoryError has
|
|
been resolved in strscan gem 3.0.9.
|
|
|
|
https://github.com/ruby/strscan/issues/83
|
|
|
|
## Benchmark
|
|
|
|
```
|
|
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
|
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
|
|
Calculating -------------------------------------
|
|
before after before(YJIT) after(YJIT)
|
|
dom 10.958 11.044 16.615 16.783 i/s - 100.000 times in 9.126104s 9.055023s 6.018799s 5.958437s
|
|
sax 29.624 29.609 44.390 45.370 i/s - 100.000 times in 3.375641s 3.377372s 2.252774s 2.204080s
|
|
pull 33.868 34.695 51.173 53.492 i/s - 100.000 times in 2.952679s 2.882229s 1.954138s 1.869422s
|
|
stream 31.719 32.351 43.604 45.403 i/s - 100.000 times in 3.152713s 3.091052s 2.293356s 2.202514s
|
|
|
|
Comparison:
|
|
dom
|
|
after(YJIT): 16.8 i/s
|
|
before(YJIT): 16.6 i/s - 1.01x slower
|
|
after: 11.0 i/s - 1.52x slower
|
|
before: 11.0 i/s - 1.53x slower
|
|
|
|
sax
|
|
after(YJIT): 45.4 i/s
|
|
before(YJIT): 44.4 i/s - 1.02x slower
|
|
before: 29.6 i/s - 1.53x slower
|
|
after: 29.6 i/s - 1.53x slower
|
|
|
|
pull
|
|
after(YJIT): 53.5 i/s
|
|
before(YJIT): 51.2 i/s - 1.05x slower
|
|
after: 34.7 i/s - 1.54x slower
|
|
before: 33.9 i/s - 1.58x slower
|
|
|
|
stream
|
|
after(YJIT): 45.4 i/s
|
|
before(YJIT): 43.6 i/s - 1.04x slower
|
|
after: 32.4 i/s - 1.40x slower
|
|
before: 31.7 i/s - 1.43x slower
|
|
|
|
```
|
|
|
|
- YJIT=ON : 1.01x - 1.05x faster
|
|
- YJIT=OFF : 1.00x - 1.02x faster
|
|
===
|
|
Backported because PR #114 and especially PR #126 rely on the method using `<<`
|
|
instead of setting the scanner's string directly.
|
|
|
|
https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320
|
|
---
|
|
lib/rexml/source.rb | 6 +-----
|
|
1 file changed, 1 insertion(+), 5 deletions(-)
|
|
|
|
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
|
|
index 9883e79210..4624777fa0 100644
|
|
--- a/lib/rexml/source.rb
|
|
+++ b/lib/rexml/source.rb
|
|
@@ -149,11 +149,7 @@ def initialize(arg, block_size=500, encoding=nil)
|
|
|
|
def read
|
|
begin
|
|
- # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
|
|
- # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
|
|
- # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
|
|
- # and avoids this problem.
|
|
- @scanner.string = @scanner.rest + readline
|
|
+ @scanner << readline
|
|
rescue Exception, NameError
|
|
@source = nil
|
|
end
|
|
|
|
From 370666e314816b57ecd5878e757224c3b6bc93f5 Mon Sep 17 00:00:00 2001
|
|
From: NAITOH Jun <naitoh@gmail.com>
|
|
Date: Tue, 27 Feb 2024 09:48:35 +0900
|
|
Subject: [PATCH 4/7] Use more StringScanner based API to parse XML (#114)
|
|
|
|
## Why?
|
|
|
|
Improve maintainability by optimizing the process so that the parsing
|
|
process proceeds using StringScanner#scan.
|
|
|
|
## Changed
|
|
- Change `REXML::Parsers::BaseParser` from `frozen_string_literal:
|
|
false` to `frozen_string_literal: true`.
|
|
- Added `Source#string=` method for error message output.
|
|
- Added TestParseDocumentTypeDeclaration#test_no_name test case.
|
|
- In the `intSubset` of DOCTYPE, added consideration for processing
|
|
`Comments` that begin with "<!".
|
|
|
|
## [Benchmark]
|
|
|
|
```
|
|
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
|
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
|
|
Calculating -------------------------------------
|
|
before after before(YJIT) after(YJIT)
|
|
dom 11.240 10.569 17.173 18.219 i/s - 100.000 times in 8.896882s 9.461267s 5.823007s 5.488884s
|
|
sax 31.812 30.716 48.383 52.532 i/s - 100.000 times in 3.143500s 3.255655s 2.066861s 1.903600s
|
|
pull 36.855 36.354 56.718 61.443 i/s - 100.000 times in 2.713300s 2.750693s 1.763099s 1.627523s
|
|
stream 34.176 34.758 49.801 54.622 i/s - 100.000 times in 2.925991s 2.877065s 2.008003s 1.830779s
|
|
|
|
Comparison:
|
|
dom
|
|
after(YJIT): 18.2 i/s
|
|
before(YJIT): 17.2 i/s - 1.06x slower
|
|
before: 11.2 i/s - 1.62x slower
|
|
after: 10.6 i/s - 1.72x slower
|
|
|
|
sax
|
|
after(YJIT): 52.5 i/s
|
|
before(YJIT): 48.4 i/s - 1.09x slower
|
|
before: 31.8 i/s - 1.65x slower
|
|
after: 30.7 i/s - 1.71x slower
|
|
|
|
pull
|
|
after(YJIT): 61.4 i/s
|
|
before(YJIT): 56.7 i/s - 1.08x slower
|
|
before: 36.9 i/s - 1.67x slower
|
|
after: 36.4 i/s - 1.69x slower
|
|
|
|
stream
|
|
after(YJIT): 54.6 i/s
|
|
before(YJIT): 49.8 i/s - 1.10x slower
|
|
after: 34.8 i/s - 1.57x slower
|
|
before: 34.2 i/s - 1.60x slower
|
|
|
|
```
|
|
|
|
- YJIT=ON : 1.06x - 1.10x faster
|
|
- YJIT=OFF : 0.94x - 1.01x faster
|
|
|
|
---------
|
|
|
|
Co-authored-by: Sutou Kouhei <kou@clear-code.com>
|
|
|
|
===
|
|
This commit introduces wider use of StringScanner in
|
|
lib/rexml/parsers/baseparser.rb.
|
|
|
|
This is relied upon in later patches.
|
|
|
|
MISSING_ATTRIBUTE_QUOTES is not present in upstream
|
|
lib/rexml/parsers/baseparser.rb at this point.
|
|
While it doesn't seem to be used, this patch does not
|
|
remove the constant so let's keep it around.
|
|
|
|
https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 328 ++++++++++--------
|
|
lib/rexml/source.rb | 31 +-
|
|
.../parse/test_document_type_declaration.rb | 15 +
|
|
3 files changed, 205 insertions(+), 169 deletions(-)
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index c29783af5c..dafc1b1449 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -1,4 +1,4 @@
|
|
-# frozen_string_literal: false
|
|
+# frozen_string_literal: true
|
|
|
|
require "strscan"
|
|
|
|
@@ -121,6 +121,19 @@ class BaseParser
|
|
######################################################################
|
|
MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um
|
|
|
|
+ module Private
|
|
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
|
|
+ TAG_PATTERN = /((?>#{QNAME_STR}))/um
|
|
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
|
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
|
+ NAME_PATTERN = /\s*#{NAME}/um
|
|
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
|
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
|
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
|
|
+ end
|
|
+ private_constant :Private
|
|
+ include Private
|
|
+
|
|
def initialize( source )
|
|
self.stream = source
|
|
@listeners = []
|
|
@@ -206,180 +219,172 @@ def pull_event
|
|
#STDERR.puts @source.encoding
|
|
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
|
|
if @document_status == nil
|
|
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
|
|
- word = word[1] unless word.nil?
|
|
- #STDERR.puts "WORD = #{word.inspect}"
|
|
- case word
|
|
- when COMMENT_START
|
|
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
|
|
- when XMLDECL_START
|
|
- #STDERR.puts "XMLDECL"
|
|
- results = @source.match( XMLDECL_PATTERN, true )[1]
|
|
- version = VERSION.match( results )
|
|
- version = version[1] unless version.nil?
|
|
- encoding = ENCODING.match(results)
|
|
- encoding = encoding[1] unless encoding.nil?
|
|
- if need_source_encoding_update?(encoding)
|
|
- @source.encoding = encoding
|
|
- end
|
|
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
|
- encoding = "UTF-16"
|
|
- end
|
|
- standalone = STANDALONE.match(results)
|
|
- standalone = standalone[1] unless standalone.nil?
|
|
- return [ :xmldecl, version, encoding, standalone ]
|
|
- when INSTRUCTION_START
|
|
+ if @source.match("<?", true)
|
|
return process_instruction
|
|
- when DOCTYPE_START
|
|
- base_error_message = "Malformed DOCTYPE"
|
|
- @source.match(DOCTYPE_START, true)
|
|
- @nsstack.unshift(curr_ns=Set.new)
|
|
- name = parse_name(base_error_message)
|
|
- if @source.match(/\A\s*\[/um, true)
|
|
- id = [nil, nil, nil]
|
|
- @document_status = :in_doctype
|
|
- elsif @source.match(/\A\s*>/um, true)
|
|
- id = [nil, nil, nil]
|
|
- @document_status = :after_doctype
|
|
- else
|
|
- id = parse_id(base_error_message,
|
|
- accept_external_id: true,
|
|
- accept_public_id: false)
|
|
- if id[0] == "SYSTEM"
|
|
- # For backward compatibility
|
|
- id[1], id[2] = id[2], nil
|
|
+ elsif @source.match("<!", true)
|
|
+ if @source.match("--", true)
|
|
+ return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
|
|
+ elsif @source.match("DOCTYPE", true)
|
|
+ base_error_message = "Malformed DOCTYPE"
|
|
+ unless @source.match(/\s+/um, true)
|
|
+ if @source.match(">")
|
|
+ message = "#{base_error_message}: name is missing"
|
|
+ else
|
|
+ message = "#{base_error_message}: invalid name"
|
|
+ end
|
|
+ @source.string = "<!DOCTYPE" + @source.buffer
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- if @source.match(/\A\s*\[/um, true)
|
|
- @document_status = :in_doctype
|
|
- elsif @source.match(/\A\s*>/um, true)
|
|
+ @nsstack.unshift(curr_ns=Set.new)
|
|
+ name = parse_name(base_error_message)
|
|
+ if @source.match(/\s*\[/um, true)
|
|
+ id = [nil, nil, nil]
|
|
+ @document_status = :in_doctype
|
|
+ elsif @source.match(/\s*>/um, true)
|
|
+ id = [nil, nil, nil]
|
|
@document_status = :after_doctype
|
|
else
|
|
- message = "#{base_error_message}: garbage after external ID"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
+ id = parse_id(base_error_message,
|
|
+ accept_external_id: true,
|
|
+ accept_public_id: false)
|
|
+ if id[0] == "SYSTEM"
|
|
+ # For backward compatibility
|
|
+ id[1], id[2] = id[2], nil
|
|
+ end
|
|
+ if @source.match(/\s*\[/um, true)
|
|
+ @document_status = :in_doctype
|
|
+ elsif @source.match(/\s*>/um, true)
|
|
+ @document_status = :after_doctype
|
|
+ else
|
|
+ message = "#{base_error_message}: garbage after external ID"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
end
|
|
- end
|
|
- args = [:start_doctype, name, *id]
|
|
- if @document_status == :after_doctype
|
|
- @source.match(/\A\s*/um, true)
|
|
- @stack << [ :end_doctype ]
|
|
- end
|
|
- return args
|
|
- when /\A\s+/
|
|
- else
|
|
- @document_status = :after_doctype
|
|
- if @source.encoding == "UTF-8"
|
|
- @source.buffer_encoding = ::Encoding::UTF_8
|
|
+ args = [:start_doctype, name, *id]
|
|
+ if @document_status == :after_doctype
|
|
+ @source.match(/\s*/um, true)
|
|
+ @stack << [ :end_doctype ]
|
|
+ end
|
|
+ return args
|
|
+ else
|
|
+ message = "Invalid XML"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
end
|
|
end
|
|
end
|
|
if @document_status == :in_doctype
|
|
- md = @source.match(/\A\s*(.*?>)/um)
|
|
- case md[1]
|
|
- when SYSTEMENTITY
|
|
- match = @source.match( SYSTEMENTITY, true )[1]
|
|
- return [ :externalentity, match ]
|
|
-
|
|
- when ELEMENTDECL_START
|
|
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
|
|
-
|
|
- when ENTITY_START
|
|
- match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
|
|
- ref = false
|
|
- if match[1] == '%'
|
|
- ref = true
|
|
- match.delete_at 1
|
|
- end
|
|
- # Now we have to sort out what kind of entity reference this is
|
|
- if match[2] == 'SYSTEM'
|
|
- # External reference
|
|
- match[3] = match[3][1..-2] # PUBID
|
|
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
|
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
|
- elsif match[2] == 'PUBLIC'
|
|
- # External reference
|
|
- match[3] = match[3][1..-2] # PUBID
|
|
- match[4] = match[4][1..-2] # HREF
|
|
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
|
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
|
- else
|
|
- match[2] = match[2][1..-2]
|
|
- match.pop if match.size == 4
|
|
- # match is [ :entity, name, value ]
|
|
- end
|
|
- match << '%' if ref
|
|
- return match
|
|
- when ATTLISTDECL_START
|
|
- md = @source.match( ATTLISTDECL_PATTERN, true )
|
|
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
|
- element = md[1]
|
|
- contents = md[0]
|
|
-
|
|
- pairs = {}
|
|
- values = md[0].scan( ATTDEF_RE )
|
|
- values.each do |attdef|
|
|
- unless attdef[3] == "#IMPLIED"
|
|
- attdef.compact!
|
|
- val = attdef[3]
|
|
- val = attdef[4] if val == "#FIXED "
|
|
- pairs[attdef[0]] = val
|
|
- if attdef[0] =~ /^xmlns:(.*)/
|
|
- @nsstack[0] << $1
|
|
- end
|
|
+ @source.match(/\s*/um, true) # skip spaces
|
|
+ if @source.match("<!", true)
|
|
+ if @source.match("ELEMENT", true)
|
|
+ md = @source.match(/(.*?)>/um, true)
|
|
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
|
|
+ return [ :elementdecl, "<!ELEMENT" + md[1] ]
|
|
+ elsif @source.match("ENTITY", true)
|
|
+ match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
|
|
+ ref = false
|
|
+ if match[1] == '%'
|
|
+ ref = true
|
|
+ match.delete_at 1
|
|
end
|
|
- end
|
|
- return [ :attlistdecl, element, pairs, contents ]
|
|
- when NOTATIONDECL_START
|
|
- base_error_message = "Malformed notation declaration"
|
|
- unless @source.match(/\A\s*<!NOTATION\s+/um, true)
|
|
- if @source.match(/\A\s*<!NOTATION\s*>/um)
|
|
- message = "#{base_error_message}: name is missing"
|
|
+ # Now we have to sort out what kind of entity reference this is
|
|
+ if match[2] == 'SYSTEM'
|
|
+ # External reference
|
|
+ match[3] = match[3][1..-2] # PUBID
|
|
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
|
|
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
|
|
+ elsif match[2] == 'PUBLIC'
|
|
+ # External reference
|
|
+ match[3] = match[3][1..-2] # PUBID
|
|
+ match[4] = match[4][1..-2] # HREF
|
|
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
|
|
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
|
|
else
|
|
- message = "#{base_error_message}: invalid declaration name"
|
|
+ match[2] = match[2][1..-2]
|
|
+ match.pop if match.size == 4
|
|
+ # match is [ :entity, name, value ]
|
|
end
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- end
|
|
- name = parse_name(base_error_message)
|
|
- id = parse_id(base_error_message,
|
|
- accept_external_id: true,
|
|
- accept_public_id: true)
|
|
- unless @source.match(/\A\s*>/um, true)
|
|
- message = "#{base_error_message}: garbage before end >"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
+ match << '%' if ref
|
|
+ return match
|
|
+ elsif @source.match("ATTLIST", true)
|
|
+ md = @source.match(ATTLISTDECL_END, true)
|
|
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
|
|
+ element = md[1]
|
|
+ contents = md[0]
|
|
+
|
|
+ pairs = {}
|
|
+ values = md[0].scan( ATTDEF_RE )
|
|
+ values.each do |attdef|
|
|
+ unless attdef[3] == "#IMPLIED"
|
|
+ attdef.compact!
|
|
+ val = attdef[3]
|
|
+ val = attdef[4] if val == "#FIXED "
|
|
+ pairs[attdef[0]] = val
|
|
+ if attdef[0] =~ /^xmlns:(.*)/
|
|
+ @nsstack[0] << $1
|
|
+ end
|
|
+ end
|
|
+ end
|
|
+ return [ :attlistdecl, element, pairs, contents ]
|
|
+ elsif @source.match("NOTATION", true)
|
|
+ base_error_message = "Malformed notation declaration"
|
|
+ unless @source.match(/\s+/um, true)
|
|
+ if @source.match(">")
|
|
+ message = "#{base_error_message}: name is missing"
|
|
+ else
|
|
+ message = "#{base_error_message}: invalid name"
|
|
+ end
|
|
+ @source.string = " <!NOTATION" + @source.buffer
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
+ name = parse_name(base_error_message)
|
|
+ id = parse_id(base_error_message,
|
|
+ accept_external_id: true,
|
|
+ accept_public_id: true)
|
|
+ unless @source.match(/\s*>/um, true)
|
|
+ message = "#{base_error_message}: garbage before end >"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
+ return [:notationdecl, name, *id]
|
|
+ elsif md = @source.match(/--(.*?)-->/um, true)
|
|
+ case md[1]
|
|
+ when /--/, /-\z/
|
|
+ raise REXML::ParseException.new("Malformed comment", @source)
|
|
+ end
|
|
+ return [ :comment, md[1] ] if md
|
|
end
|
|
- return [:notationdecl, name, *id]
|
|
- when DOCTYPE_END
|
|
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
|
|
+ return [ :externalentity, match[1] ]
|
|
+ elsif @source.match(/\]\s*>/um, true)
|
|
@document_status = :after_doctype
|
|
- @source.match( DOCTYPE_END, true )
|
|
return [ :end_doctype ]
|
|
end
|
|
end
|
|
if @document_status == :after_doctype
|
|
- @source.match(/\A\s*/um, true)
|
|
+ @source.match(/\s*/um, true)
|
|
end
|
|
begin
|
|
- @source.read if @source.buffer.size<2
|
|
- if @source.buffer[0] == ?<
|
|
- if @source.buffer[1] == ?/
|
|
+ if @source.match("<", true)
|
|
+ if @source.match("/", true)
|
|
@nsstack.shift
|
|
last_tag = @tags.pop
|
|
- #md = @source.match_to_consume( '>', CLOSE_MATCH)
|
|
- md = @source.match( CLOSE_MATCH, true )
|
|
+ md = @source.match(CLOSE_PATTERN, true)
|
|
if md and !last_tag
|
|
message = "Unexpected top-level end tag (got '#{md[1]}')"
|
|
raise REXML::ParseException.new(message, @source)
|
|
end
|
|
if md.nil? or last_tag != md[1]
|
|
message = "Missing end tag for '#{last_tag}'"
|
|
- message << " (got '#{md[1]}')" if md
|
|
+ message += " (got '#{md[1]}')" if md
|
|
+ @source.string = "</" + @source.buffer if md.nil?
|
|
raise REXML::ParseException.new(message, @source)
|
|
end
|
|
return [ :end_element, last_tag ]
|
|
- elsif @source.buffer[1] == ?!
|
|
- md = @source.match(/\A(\s*[^>]*>)/um)
|
|
+ elsif @source.match("!", true)
|
|
+ md = @source.match(/([^>]*>)/um)
|
|
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
|
|
raise REXML::ParseException.new("Malformed node", @source) unless md
|
|
- if md[0][2] == ?-
|
|
- md = @source.match( COMMENT_PATTERN, true )
|
|
+ if md[0][0] == ?-
|
|
+ md = @source.match(/--(.*?)-->/um, true)
|
|
|
|
case md[1]
|
|
when /--/, /-\z/
|
|
@@ -388,19 +393,18 @@ def pull_event
|
|
|
|
return [ :comment, md[1] ] if md
|
|
else
|
|
- md = @source.match( CDATA_PATTERN, true )
|
|
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
|
|
return [ :cdata, md[1] ] if md
|
|
end
|
|
raise REXML::ParseException.new( "Declarations can only occur "+
|
|
"in the doctype declaration.", @source)
|
|
- elsif @source.buffer[1] == ??
|
|
+ elsif @source.match("?", true)
|
|
return process_instruction
|
|
else
|
|
# Get the next tag
|
|
- md = @source.match(TAG_MATCH, true)
|
|
+ md = @source.match(TAG_PATTERN, true)
|
|
unless md
|
|
- # Check for missing attribute quotes
|
|
- raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
|
|
+ @source.string = "<" + @source.buffer
|
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
|
end
|
|
tag = md[1]
|
|
@@ -425,7 +429,7 @@ def pull_event
|
|
return [ :start_element, tag, attributes ]
|
|
end
|
|
else
|
|
- md = @source.match( TEXT_PATTERN, true )
|
|
+ md = @source.match(/([^<]*)/um, true)
|
|
text = md[1]
|
|
if md[0].length == 0
|
|
@source.match( /(\s+)/, true )
|
|
@@ -472,8 +476,7 @@ def normalize( input, entities=nil, entity_filter=nil )
|
|
|
|
# Unescapes all possible entities
|
|
def unnormalize( string, entities=nil, filter=nil )
|
|
- rv = string.clone
|
|
- rv.gsub!( /\r\n?/, "\n" )
|
|
+ rv = string.gsub( /\r\n?/, "\n" )
|
|
matches = rv.scan( REFERENCE_RE )
|
|
return rv if matches.size == 0
|
|
rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
|
|
@@ -508,9 +511,9 @@ def need_source_encoding_update?(xml_declaration_encoding)
|
|
end
|
|
|
|
def parse_name(base_error_message)
|
|
- md = @source.match(/\A\s*#{NAME}/um, true)
|
|
+ md = @source.match(NAME_PATTERN, true)
|
|
unless md
|
|
- if @source.match(/\A\s*\S/um)
|
|
+ if @source.match(/\s*\S/um)
|
|
message = "#{base_error_message}: invalid name"
|
|
else
|
|
message = "#{base_error_message}: name is missing"
|
|
@@ -671,12 +674,29 @@ def parse_attributes(prefixes, curr_ns)
|
|
end
|
|
|
|
def process_instruction
|
|
- match_data = @source.match(INSTRUCTION_PATTERN, true)
|
|
+ match_data = @source.match(INSTRUCTION_END, true)
|
|
unless match_data
|
|
message = "Invalid processing instruction node"
|
|
+ @source.string = "<?" + @source.buffer
|
|
raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- [:processing_instruction, match_data[1], match_data[4]]
|
|
+ if @document_status.nil? and match_data[1] == "xml"
|
|
+ content = match_data[2]
|
|
+ version = VERSION.match(content)
|
|
+ version = version[1] unless version.nil?
|
|
+ encoding = ENCODING.match(content)
|
|
+ encoding = encoding[1] unless encoding.nil?
|
|
+ if need_source_encoding_update?(encoding)
|
|
+ @source.encoding = encoding
|
|
+ end
|
|
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
|
|
+ encoding = "UTF-16"
|
|
+ end
|
|
+ standalone = STANDALONE.match(content)
|
|
+ standalone = standalone[1] unless standalone.nil?
|
|
+ return [ :xmldecl, version, encoding, standalone ]
|
|
+ end
|
|
+ [:processing_instruction, match_data[1], match_data[2]]
|
|
end
|
|
end
|
|
end
|
|
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
|
|
index 4624777fa0..d2a6ad872e 100644
|
|
--- a/lib/rexml/source.rb
|
|
+++ b/lib/rexml/source.rb
|
|
@@ -76,6 +76,10 @@ def match(pattern, cons=false)
|
|
end
|
|
end
|
|
|
|
+ def string=(string)
|
|
+ @scanner.string = string
|
|
+ end
|
|
+
|
|
# @return true if the Source is exhausted
|
|
def empty?
|
|
@scanner.eos?
|
|
@@ -150,28 +154,25 @@ def initialize(arg, block_size=500, encoding=nil)
|
|
def read
|
|
begin
|
|
@scanner << readline
|
|
+ true
|
|
rescue Exception, NameError
|
|
@source = nil
|
|
+ false
|
|
end
|
|
end
|
|
|
|
def match( pattern, cons=false )
|
|
- if cons
|
|
- md = @scanner.scan(pattern)
|
|
- else
|
|
- md = @scanner.check(pattern)
|
|
- end
|
|
- while md.nil? and @source
|
|
- begin
|
|
- @scanner << readline
|
|
- if cons
|
|
- md = @scanner.scan(pattern)
|
|
- else
|
|
- md = @scanner.check(pattern)
|
|
- end
|
|
- rescue
|
|
- @source = nil
|
|
+ read if @scanner.eos? && @source
|
|
+ while true
|
|
+ if cons
|
|
+ md = @scanner.scan(pattern)
|
|
+ else
|
|
+ md = @scanner.check(pattern)
|
|
end
|
|
+ break if md
|
|
+ return nil if pattern.is_a?(String) && pattern.bytesize <= @scanner.rest_size
|
|
+ return nil if @source.nil?
|
|
+ return nil unless read
|
|
end
|
|
|
|
md.nil? ? nil : @scanner
|
|
diff --git a/test/rexml/parse/test_document_type_declaration.rb b/test/rexml/parse/test_document_type_declaration.rb
|
|
index 55713909e7..8faa0b78be 100644
|
|
--- a/test/rexml/parse/test_document_type_declaration.rb
|
|
+++ b/test/rexml/parse/test_document_type_declaration.rb
|
|
@@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start
|
|
+ r SYSTEM "urn:x-rexml:test" [ ]> <r/>
|
|
DETAIL
|
|
end
|
|
+
|
|
+ def test_no_name
|
|
+ exception = assert_raise(REXML::ParseException) do
|
|
+ parse(<<-DOCTYPE)
|
|
+<!DOCTYPE>
|
|
+ DOCTYPE
|
|
+ end
|
|
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
|
|
+Malformed DOCTYPE: name is missing
|
|
+Line: 3
|
|
+Position: 17
|
|
+Last 80 unconsumed characters:
|
|
+<!DOCTYPE> <r/>
|
|
+ DETAIL
|
|
+ end
|
|
end
|
|
|
|
class TestExternalID < self
|
|
|
|
From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001
|
|
From: NAITOH Jun <naitoh@gmail.com>
|
|
Date: Mon, 18 Mar 2024 23:30:47 +0900
|
|
Subject: [PATCH 5/7] Optimize the parse_attributes method to use
|
|
`Source#match` to parse XML. (#119)
|
|
|
|
## Why?
|
|
|
|
Improve maintainability by consolidating processing into `Source#match`.
|
|
|
|
## Benchmark
|
|
```
|
|
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
|
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
|
|
Calculating -------------------------------------
|
|
before after before(YJIT) after(YJIT)
|
|
dom 10.891 10.622 16.356 17.403 i/s - 100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s
|
|
sax 30.335 29.845 49.749 54.877 i/s - 100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s
|
|
pull 35.514 34.801 61.123 66.908 i/s - 100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s
|
|
stream 35.141 34.475 52.110 56.836 i/s - 100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s
|
|
|
|
Comparison:
|
|
dom
|
|
after(YJIT): 17.4 i/s
|
|
before(YJIT): 16.4 i/s - 1.06x slower
|
|
before: 10.9 i/s - 1.60x slower
|
|
after: 10.6 i/s - 1.64x slower
|
|
|
|
sax
|
|
after(YJIT): 54.9 i/s
|
|
before(YJIT): 49.7 i/s - 1.10x slower
|
|
before: 30.3 i/s - 1.81x slower
|
|
after: 29.8 i/s - 1.84x slower
|
|
|
|
pull
|
|
after(YJIT): 66.9 i/s
|
|
before(YJIT): 61.1 i/s - 1.09x slower
|
|
before: 35.5 i/s - 1.88x slower
|
|
after: 34.8 i/s - 1.92x slower
|
|
|
|
stream
|
|
after(YJIT): 56.8 i/s
|
|
before(YJIT): 52.1 i/s - 1.09x slower
|
|
before: 35.1 i/s - 1.62x slower
|
|
after: 34.5 i/s - 1.65x slower
|
|
|
|
```
|
|
|
|
- YJIT=ON : 1.06x - 1.10x faster
|
|
- YJIT=OFF : 0.97x - 0.98x faster
|
|
===
|
|
"test_attribute_namespace_conflict" in test/rexml/test_core.rb
|
|
was not adjusted because the commit that added the test is not present.
|
|
|
|
Wider use of Source#match to parse XML.
|
|
|
|
Changes the way BaseParser#parse_attributes
|
|
grabs fields that make up an XML attribute.
|
|
|
|
This is relied upon by upstream 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
|
|
that actually fixes CVE-2024-35176.
|
|
|
|
https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 113 ++++++++++++-------------------
|
|
test/rexml/parse/test_element.rb | 2 +-
|
|
test/rexml/test_core.rb | 17 ++++-
|
|
3 files changed, 61 insertions(+), 71 deletions(-)
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index dafc1b1449..45eabe8cc5 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -123,7 +123,7 @@ class BaseParser
|
|
|
|
module Private
|
|
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
|
|
- TAG_PATTERN = /((?>#{QNAME_STR}))/um
|
|
+ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
|
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
|
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
|
NAME_PATTERN = /\s*#{NAME}/um
|
|
@@ -592,85 +592,60 @@ def parse_id_invalid_details(accept_external_id:,
|
|
def parse_attributes(prefixes, curr_ns)
|
|
attributes = {}
|
|
closed = false
|
|
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
|
|
- if match_data.nil?
|
|
- message = "Start tag isn't ended"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- end
|
|
-
|
|
- raw_attributes = match_data[1]
|
|
- closed = !match_data[2].nil?
|
|
- return attributes, closed if raw_attributes.nil?
|
|
- return attributes, closed if raw_attributes.empty?
|
|
-
|
|
- scanner = StringScanner.new(raw_attributes)
|
|
- until scanner.eos?
|
|
- if scanner.scan(/\s+/)
|
|
- break if scanner.eos?
|
|
- end
|
|
-
|
|
- pos = scanner.pos
|
|
- loop do
|
|
- break if scanner.scan(ATTRIBUTE_PATTERN)
|
|
- unless scanner.scan(QNAME)
|
|
- message = "Invalid attribute name: <#{scanner.rest}>"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- end
|
|
- name = scanner[0]
|
|
- unless scanner.scan(/\s*=\s*/um)
|
|
+ while true
|
|
+ if @source.match(">", true)
|
|
+ return attributes, closed
|
|
+ elsif @source.match("/>", true)
|
|
+ closed = true
|
|
+ return attributes, closed
|
|
+ elsif match = @source.match(QNAME, true)
|
|
+ name = match[1]
|
|
+ prefix = match[2]
|
|
+ local_part = match[3]
|
|
+
|
|
+ unless @source.match(/\s*=\s*/um, true)
|
|
message = "Missing attribute equal: <#{name}>"
|
|
raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- quote = scanner.scan(/['"]/)
|
|
- unless quote
|
|
- message = "Missing attribute value start quote: <#{name}>"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- end
|
|
- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
|
|
- match_data = @source.match(/^(.*?)(\/)?>/um, true)
|
|
- if match_data
|
|
- scanner << "/" if closed
|
|
- scanner << ">"
|
|
- scanner << match_data[1]
|
|
- scanner.pos = pos
|
|
- closed = !match_data[2].nil?
|
|
- next
|
|
+ unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
|
|
+ if match = @source.match(/(['"])/, true)
|
|
+ message =
|
|
+ "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ else
|
|
+ message = "Missing attribute value start quote: <#{name}>"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- message =
|
|
- "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- end
|
|
- name = scanner[1]
|
|
- prefix = scanner[2]
|
|
- local_part = scanner[3]
|
|
- # quote = scanner[4]
|
|
- value = scanner[5]
|
|
- if prefix == "xmlns"
|
|
- if local_part == "xml"
|
|
- if value != "http://www.w3.org/XML/1998/namespace"
|
|
- msg = "The 'xml' prefix must not be bound to any other namespace "+
|
|
+ value = match[2]
|
|
+ if prefix == "xmlns"
|
|
+ if local_part == "xml"
|
|
+ if value != "http://www.w3.org/XML/1998/namespace"
|
|
+ msg = "The 'xml' prefix must not be bound to any other namespace "+
|
|
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
+ raise REXML::ParseException.new( msg, @source, self )
|
|
+ end
|
|
+ elsif local_part == "xmlns"
|
|
+ msg = "The 'xmlns' prefix must not be declared "+
|
|
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
- raise REXML::ParseException.new( msg, @source, self )
|
|
+ raise REXML::ParseException.new( msg, @source, self)
|
|
end
|
|
- elsif local_part == "xmlns"
|
|
- msg = "The 'xmlns' prefix must not be declared "+
|
|
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
- raise REXML::ParseException.new( msg, @source, self)
|
|
+ curr_ns << local_part
|
|
+ elsif prefix
|
|
+ prefixes << prefix unless prefix == "xml"
|
|
end
|
|
- curr_ns << local_part
|
|
- elsif prefix
|
|
- prefixes << prefix unless prefix == "xml"
|
|
- end
|
|
|
|
- if attributes.has_key?(name)
|
|
- msg = "Duplicate attribute #{name.inspect}"
|
|
- raise REXML::ParseException.new(msg, @source, self)
|
|
- end
|
|
+ if attributes.has_key?(name)
|
|
+ msg = "Duplicate attribute #{name.inspect}"
|
|
+ raise REXML::ParseException.new(msg, @source, self)
|
|
+ end
|
|
|
|
- attributes[name] = value
|
|
+ attributes[name] = value
|
|
+ else
|
|
+ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
end
|
|
- return attributes, closed
|
|
end
|
|
|
|
def process_instruction
|
|
diff --git a/test/rexml/parse/test_element.rb b/test/rexml/parse/test_element.rb
|
|
index e8dce4b997..987214f3bb 100644
|
|
--- a/test/rexml/parse/test_element.rb
|
|
+++ b/test/rexml/parse/test_element.rb
|
|
@@ -43,7 +43,7 @@ def test_empty_namespace_attribute_name
|
|
Line: 1
|
|
Position: 13
|
|
Last 80 unconsumed characters:
|
|
-
|
|
+:a=""></x>
|
|
DETAIL
|
|
end
|
|
|
|
diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
|
|
index d4ece491b9..46de90ccd7 100644
|
|
--- a/test/rexml/test_core.rb
|
|
+++ b/test/rexml/test_core.rb
|
|
@@ -1277,11 +1277,26 @@ def test_ticket_21
|
|
exception = assert_raise(ParseException) do
|
|
Document.new(src)
|
|
end
|
|
- assert_equal(<<-DETAIL, exception.to_s)
|
|
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
|
|
Missing attribute value start quote: <bar>
|
|
Line: 1
|
|
Position: 16
|
|
Last 80 unconsumed characters:
|
|
+value/>
|
|
+ DETAIL
|
|
+ end
|
|
+
|
|
+ def test_parse_exception_on_missing_attribute_end_quote
|
|
+ src = '<foo bar="value/>'
|
|
+ exception = assert_raise(ParseException) do
|
|
+ Document.new(src)
|
|
+ end
|
|
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
|
|
+Missing attribute value end quote: <bar>: <">
|
|
+Line: 1
|
|
+Position: 17
|
|
+Last 80 unconsumed characters:
|
|
+value/>
|
|
DETAIL
|
|
end
|
|
|
|
|
|
From 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb Mon Sep 17 00:00:00 2001
|
|
From: Nobuyoshi Nakada <nobu@ruby-lang.org>
|
|
Date: Thu, 16 May 2024 11:26:51 +0900
|
|
Subject: [PATCH 6/7] Read quoted attributes in chunks (#126)
|
|
|
|
===
|
|
test/rexml/test_document.rb:
|
|
* test_gt_linear_performance
|
|
Test not included because Ruby 2.5 does not have the assert_linear_performance
|
|
method available in the test suite. It was introduced in Ruby 2.7.
|
|
* Do not require 'core_assertions'
|
|
Current requires are fine for running the tests in mock build.
|
|
|
|
The fix for CVE-2024-35176. By reading the field in chunks instead of 1
|
|
by 1 it improves performance for large strings composed of angle
|
|
brackets.
|
|
|
|
https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 20 ++++++++++----------
|
|
lib/rexml/source.rb | 29 ++++++++++++++++++++++++-----
|
|
2 files changed, 34 insertions(+), 15 deletions(-)
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index 45eabe8cc5..304835e407 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -607,17 +607,17 @@ def parse_attributes(prefixes, curr_ns)
|
|
message = "Missing attribute equal: <#{name}>"
|
|
raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
|
|
- if match = @source.match(/(['"])/, true)
|
|
- message =
|
|
- "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- else
|
|
- message = "Missing attribute value start quote: <#{name}>"
|
|
- raise REXML::ParseException.new(message, @source)
|
|
- end
|
|
+ unless match = @source.match(/(['"])/, true)
|
|
+ message = "Missing attribute value start quote: <#{name}>"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
+ end
|
|
+ quote = match[1]
|
|
+ value = @source.read_until(quote)
|
|
+ unless value.chomp!(quote)
|
|
+ message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
end
|
|
- value = match[2]
|
|
+ @source.match(/\s*/um, true)
|
|
if prefix == "xmlns"
|
|
if local_part == "xml"
|
|
if value != "http://www.w3.org/XML/1998/namespace"
|
|
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
|
|
index d2a6ad872e..2a3bd36695 100644
|
|
--- a/lib/rexml/source.rb
|
|
+++ b/lib/rexml/source.rb
|
|
@@ -65,7 +65,11 @@ def encoding=(enc)
|
|
encoding_updated
|
|
end
|
|
|
|
- def read
|
|
+ def read(term = nil)
|
|
+ end
|
|
+
|
|
+ def read_until(term)
|
|
+ @scanner.scan_until(Regexp.union(term)) or @scanner.rest
|
|
end
|
|
|
|
def match(pattern, cons=false)
|
|
@@ -151,9 +155,9 @@ def initialize(arg, block_size=500, encoding=nil)
|
|
end
|
|
end
|
|
|
|
- def read
|
|
+ def read(term = nil)
|
|
begin
|
|
- @scanner << readline
|
|
+ @scanner << readline(term)
|
|
true
|
|
rescue Exception, NameError
|
|
@source = nil
|
|
@@ -161,6 +165,21 @@ def read
|
|
end
|
|
end
|
|
|
|
+ def read_until(term)
|
|
+ pattern = Regexp.union(term)
|
|
+ data = []
|
|
+ begin
|
|
+ until str = @scanner.scan_until(pattern)
|
|
+ @scanner << readline(term)
|
|
+ end
|
|
+ rescue EOFError
|
|
+ @scanner.rest
|
|
+ else
|
|
+ read if @scanner.eos? and !@source.eof?
|
|
+ str
|
|
+ end
|
|
+ end
|
|
+
|
|
def match( pattern, cons=false )
|
|
read if @scanner.eos? && @source
|
|
while true
|
|
@@ -204,8 +223,8 @@ def current_line
|
|
end
|
|
|
|
private
|
|
- def readline
|
|
- str = @source.readline(@line_break)
|
|
+ def readline(term = nil)
|
|
+ str = @source.readline(term || @line_break)
|
|
if @pending_buffer
|
|
if str.nil?
|
|
str = @pending_buffer
|
|
|
|
From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001
|
|
From: Sutou Kouhei <kou@clear-code.com>
|
|
Date: Mon, 20 May 2024 12:17:27 +0900
|
|
Subject: [PATCH 7/7] Add support for old strscan
|
|
|
|
Fix GH-132
|
|
|
|
If we support old strscan, users can also use strscan installed as a
|
|
default gem.
|
|
|
|
Reported by Adam. Thanks!!!
|
|
|
|
===
|
|
strscan shipped with Ruby 2.5 lacks a method StringScanner#captures.
|
|
ruby/rexml#105 brought this dependency on the method.
|
|
|
|
https://bugs.ruby-lang.org/issues/20516#note-11
|
|
https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420
|
|
---
|
|
lib/rexml/parsers/baseparser.rb | 11 +++++++++++
|
|
1 file changed, 11 insertions(+)
|
|
|
|
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
|
|
index 304835e407..006d8acb9b 100644
|
|
--- a/lib/rexml/parsers/baseparser.rb
|
|
+++ b/lib/rexml/parsers/baseparser.rb
|
|
@@ -9,6 +9,17 @@
|
|
|
|
module REXML
|
|
module Parsers
|
|
+ if StringScanner::Version < "3.0.8"
|
|
+ module StringScannerCaptures
|
|
+ refine StringScanner do
|
|
+ def captures
|
|
+ values_at(*(1...size))
|
|
+ end
|
|
+ end
|
|
+ end
|
|
+ using StringScannerCaptures
|
|
+ end
|
|
+
|
|
# = Using the Pull Parser
|
|
# <em>This API is experimental, and subject to change.</em>
|
|
# parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
|