From 694239f0855668c986feba6f1b395ecd94a1f0bc Mon Sep 17 00:00:00 2001
From: Kouhei Sutou <kou@clear-code.com>
Date: Fri, 28 Sep 2018 16:29:17 +0900
Subject: [PATCH 1/7] Make PI parsing more robust

PI target must be "NAME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))". So
'<?pos="3"?>' is invalid because "=" isn't included in NAME.

Without this change, 'pos="3"' was accepted as PI target.

===
This commit implements the #process_instruction method in
lib/rexml/parsers/baseparser.rb, which later patches rely upon.

https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc
---
 lib/rexml/parsers/baseparser.rb               | 18 ++++++++-----
 test/rexml/data/t75.xml                       |  2 +-
 .../parse/test_processing_instruction.rb      | 25 +++++++++++++++++++
 3 files changed, 38 insertions(+), 7 deletions(-)
 create mode 100644 test/rexml/parse/test_processing_instruction.rb
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index e7ef695912..84a0bcdcec 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -61,7 +61,7 @@ class BaseParser
       XMLDECL_START = /\A<\?xml\s/u;
       XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
       INSTRUCTION_START = /\A<\?/u
-      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
+      INSTRUCTION_PATTERN = /<\?#{QNAME}(\s+.*?)?\?>/um
       TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
       CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
 
@@ -229,7 +229,7 @@ def pull_event
             standalone = standalone[1] unless standalone.nil?
             return [ :xmldecl, version, encoding, standalone ]
           when INSTRUCTION_START
-            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
+            return process_instruction
           when DOCTYPE_START
             base_error_message = "Malformed DOCTYPE"
             @source.match(DOCTYPE_START, true)
@@ -395,10 +395,7 @@ def pull_event
               raise REXML::ParseException.new( "Declarations can only occur "+
                 "in the doctype declaration.", @source)
             elsif @source.buffer[1] == ??
-              md = @source.match( INSTRUCTION_PATTERN, true )
-              return [ :processing_instruction, md[1], md[2] ] if md
-              raise REXML::ParseException.new( "Bad instruction declaration",
-                @source)
+              return process_instruction
             else
               # Get the next tag
               md = @source.match(TAG_MATCH, true)
@@ -675,6 +672,15 @@ def parse_attributes(prefixes, curr_ns)
         end
         return attributes, closed
       end
+
+      def process_instruction
+        match_data = @source.match(INSTRUCTION_PATTERN, true)
+        unless match_data
+          message = "Invalid processing instruction node"
+          raise REXML::ParseException.new(message, @source)
+        end
+        [:processing_instruction, match_data[1], match_data[4]]
+      end
     end
   end
 end
diff --git a/test/rexml/data/t75.xml b/test/rexml/data/t75.xml
index 0911fb1b1a..eb3cccee4b 100644
--- a/test/rexml/data/t75.xml
+++ b/test/rexml/data/t75.xml
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="ISO-8859-1"?><?pos="3"?>
+<?xml version="1.0" encoding="ISO-8859-1"?>
 <!-- generated by hnb 1.9.17 (http://hnb.sourceforge.net) -->
 
 <!DOCTYPE tree[
diff --git a/test/rexml/parse/test_processing_instruction.rb b/test/rexml/parse/test_processing_instruction.rb
new file mode 100644
index 0000000000..a23513fc6e
--- /dev/null
+++ b/test/rexml/parse/test_processing_instruction.rb
@@ -0,0 +1,25 @@
+require "test/unit"
+require "rexml/document"
+
+module REXMLTests
+  class TestParseProcessinInstruction < Test::Unit::TestCase
+    def parse(xml)
+      REXML::Document.new(xml)
+    end
+
+    class TestInvalid < self
+      def test_no_name
+        exception = assert_raise(REXML::ParseException) do
+          parse("<??>")
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Invalid processing instruction node
+Line: 1
+Position: 4
+Last 80 unconsumed characters:
+<??>
+        DETAIL
+      end
+    end
+  end
+end

From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Wed, 17 Jan 2024 15:32:57 +0900
Subject: [PATCH 2/7] Use string scanner with baseparser (#105)

Using StringScanner reduces the string copying process and speeds up the
process.

And I removed unnecessary methods.

https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105

```
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
                 dom       4.868       5.077        8.137         8.303 i/s -     100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
                 sax      13.597      13.953       19.206        20.948 i/s -     100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
                pull      15.641      16.918       22.266        25.378 i/s -     100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
              stream      14.339      15.844       19.810        22.206 i/s -     100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s

Comparison:
                              dom
        master(YJIT):         8.3 i/s
         3.2.6(YJIT):         8.1 i/s - 1.02x  slower
              master:         5.1 i/s - 1.64x  slower
         rexml 3.2.6:         4.9 i/s - 1.71x  slower

                              sax
        master(YJIT):        20.9 i/s
         3.2.6(YJIT):        19.2 i/s - 1.09x  slower
              master:        14.0 i/s - 1.50x  slower
         rexml 3.2.6:        13.6 i/s - 1.54x  slower

                             pull
        master(YJIT):        25.4 i/s
         3.2.6(YJIT):        22.3 i/s - 1.14x  slower
              master:        16.9 i/s - 1.50x  slower
         rexml 3.2.6:        15.6 i/s - 1.62x  slower

                           stream
        master(YJIT):        22.2 i/s
         3.2.6(YJIT):        19.8 i/s - 1.12x  slower
              master:        15.8 i/s - 1.40x  slower
         rexml 3.2.6:        14.3 i/s - 1.55x  slower
```

- YJIT=ON : 1.02x - 1.14x faster
- YJIT=OFF : 1.02x - 1.10x faster

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>

===
This PR mainly changes lib/rexml/source.rb: Source now uses a
StringScanner, rather than a plain String buffer, for scanning and
matching.
Later patches rely on this behavior.

Preparation for fix for CVE-2024-35176.

https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c
---
 lib/rexml/parsers/baseparser.rb             |  21 ++-
 lib/rexml/source.rb                         | 149 ++++++--------------
 test/rexml/parse/test_entity_declaration.rb |  36 +++++
 test/rexml/test_core.rb                     |   2 +-
 4 files changed, 93 insertions(+), 115 deletions(-)
 create mode 100644 test/rexml/parse/test_entity_declaration.rb

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 84a0bcdcec..c29783af5c 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -98,7 +98,7 @@ class BaseParser
       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
 
       NOTATIONDECL_START = /\A\s*<!NOTATION/um
       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -268,7 +268,7 @@ def pull_event
           else
             @document_status = :after_doctype
             if @source.encoding == "UTF-8"
-              @source.buffer.force_encoding(::Encoding::UTF_8)
+              @source.buffer_encoding = ::Encoding::UTF_8
             end
           end
         end
@@ -283,8 +283,7 @@ def pull_event
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
 
           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
-            match[0] = :entitydecl
+            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
             ref = false
             if match[1] == '%'
               ref = true
@@ -404,6 +403,7 @@ def pull_event
                 raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
+              tag = md[1]
               @document_status = :in_element
               prefixes = Set.new
               prefixes << md[2] if md[2]
@@ -417,23 +417,20 @@ def pull_event
               end
 
               if closed
-                @closed = md[1]
+                @closed = tag
                 @nsstack.shift
               else
-                @tags.push( md[1] )
+                @tags.push( tag )
               end
-              return [ :start_element, md[1], attributes ]
+              return [ :start_element, tag, attributes ]
             end
           else
             md = @source.match( TEXT_PATTERN, true )
+            text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
             end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
+            return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
           raise
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index af65cf4751..9883e79210 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
@@ -41,7 +39,8 @@ class Source
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
       @line = 0
     end
 
+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end
 
     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +65,57 @@ def encoding=(enc)
       encoding_updated
     end
 
-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
-    end
-
     def read
     end
 
-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
-    end
-
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
-    end
-
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
-    end
-
     def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end
 
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
-    end
-
-    def position
-      @orig.index( @buffer )
+      @scanner.eos?
     end
 
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
 
     private
+
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end
 
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil)
       end
 
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil)
       end
     end
 
-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
-          end
-        end
-        rv = super
-      end
-      rv.taint
-      rv
-    end
-
     def read
       begin
-        @buffer << readline
+        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
+        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
+        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
+        # and avoids this problem.
+        @scanner.string = @scanner.rest + readline
       rescue Exception, NameError
         @source = nil
       end
     end
 
-    def consume( pattern )
-      match( pattern, true )
-    end
-
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
+      if cons
+        md = @scanner.scan(pattern)
+      else
+        md = @scanner.check(pattern)
+      end
+      while md.nil? and @source
         begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
+          @scanner << readline
+          if cons
+            md = @scanner.scan(pattern)
+          else
+            md = @scanner.check(pattern)
+          end
         rescue
           @source = nil
         end
       end
-      rv.taint
-      rv
+
+      md.nil? ? nil : @scanner
     end
 
     def empty?
       super and ( @source.nil? || @source.eof? )
     end
 
-    def position
-      @er_source.pos rescue 0
-    end
-
     # @return the current line in the source
     def current_line
       begin
@@ -289,7 +234,7 @@ def encoding_updated
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end
diff --git a/test/rexml/parse/test_entity_declaration.rb b/test/rexml/parse/test_entity_declaration.rb
new file mode 100644
index 0000000000..e15deec60d
--- /dev/null
+++ b/test/rexml/parse/test_entity_declaration.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: false
+require 'test/unit'
+require 'rexml/document'
+
+module REXMLTests
+  class TestParseEntityDeclaration < Test::Unit::TestCase
+    private
+    def xml(internal_subset)
+      <<-XML
+<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
+#{internal_subset}
+]>
+<r/>
+      XML
+    end
+
+    def parse(internal_subset)
+      REXML::Document.new(xml(internal_subset)).doctype
+    end
+
+    def test_empty
+      exception = assert_raise(REXML::ParseException) do
+        parse(<<-INTERNAL_SUBSET)
+<!ENTITY>
+        INTERNAL_SUBSET
+      end
+      assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed notation declaration: name is missing
+Line: 5
+Position: 72
+Last 80 unconsumed characters:
+ <!ENTITY>  ]> <r/>
+      DETAIL
+    end
+  end
+end
diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
index ee5438d5e5..d4ece491b9 100644
--- a/test/rexml/test_core.rb
+++ b/test/rexml/test_core.rb
@@ -681,7 +681,7 @@ def test_iso_8859_1_output_function
       koln_iso_8859_1 = "K\xF6ln"
       koln_utf8 = "K\xc3\xb6ln"
       source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
-      results = source.scan(/.*/)[0]
+      results = source.match(/.*/)[0]
       koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
       assert_equal koln_utf8, results
       output << results

From 77128555476cb0db798e2912fb3a07d6411dc320 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Sun, 21 Jan 2024 20:02:00 +0900
Subject: [PATCH 3/7] Use `@scanner << readline` instead of `@scanner.string =
 @scanner.rest + readline` (#107)

## Why

JRuby's `StringScanner#<<` and `StringScanner#scan` OutOfMemoryError has
been resolved in strscan gem 3.0.9.

https://github.com/ruby/strscan/issues/83

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     10.958      11.044        16.615       16.783 i/s -     100.000 times in 9.126104s 9.055023s 6.018799s 5.958437s
                 sax     29.624      29.609        44.390       45.370 i/s -     100.000 times in 3.375641s 3.377372s 2.252774s 2.204080s
                pull     33.868      34.695        51.173       53.492 i/s -     100.000 times in 2.952679s 2.882229s 1.954138s 1.869422s
              stream     31.719      32.351        43.604       45.403 i/s -     100.000 times in 3.152713s 3.091052s 2.293356s 2.202514s

Comparison:
                              dom
         after(YJIT):        16.8 i/s
        before(YJIT):        16.6 i/s - 1.01x  slower
               after:        11.0 i/s - 1.52x  slower
              before:        11.0 i/s - 1.53x  slower

                              sax
         after(YJIT):        45.4 i/s
        before(YJIT):        44.4 i/s - 1.02x  slower
              before:        29.6 i/s - 1.53x  slower
               after:        29.6 i/s - 1.53x  slower

                             pull
         after(YJIT):        53.5 i/s
        before(YJIT):        51.2 i/s - 1.05x  slower
               after:        34.7 i/s - 1.54x  slower
              before:        33.9 i/s - 1.58x  slower

                           stream
         after(YJIT):        45.4 i/s
        before(YJIT):        43.6 i/s - 1.04x  slower
               after:        32.4 i/s - 1.40x  slower
              before:        31.7 i/s - 1.43x  slower

```

- YJIT=ON : 1.01x - 1.05x faster
- YJIT=OFF : 1.00x - 1.02x faster
===
Backported because PR #114 and, more importantly, PR #126 expect `read` to
append to the scanner with `<<` instead of assigning the scanner's string
directly.

https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320
---
 lib/rexml/source.rb | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 9883e79210..4624777fa0 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -149,11 +149,7 @@ def initialize(arg, block_size=500, encoding=nil)
 
     def read
       begin
-        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
-        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
-        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
-        # and avoids this problem.
-        @scanner.string = @scanner.rest + readline
+        @scanner << readline
       rescue Exception, NameError
         @source = nil
       end

From 370666e314816b57ecd5878e757224c3b6bc93f5 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Tue, 27 Feb 2024 09:48:35 +0900
Subject: [PATCH 4/7] Use more StringScanner based API to parse XML (#114)

## Why?

Improve maintainability by optimizing the process so that the parsing
process proceeds using StringScanner#scan.

## Changed
- Change `REXML::Parsers::BaseParser` from `frozen_string_literal:
false` to `frozen_string_literal: true`.
- Added `Source#string=` method for error message output.
- Added TestParseDocumentTypeDeclaration#test_no_name test case.
- In the `intSubset` of DOCTYPE, the handling of "<!" now takes into account
`Comments`, which also begin with "<!".

## [Benchmark]

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     11.240      10.569        17.173       18.219 i/s -     100.000 times in 8.896882s 9.461267s 5.823007s 5.488884s
                 sax     31.812      30.716        48.383       52.532 i/s -     100.000 times in 3.143500s 3.255655s 2.066861s 1.903600s
                pull     36.855      36.354        56.718       61.443 i/s -     100.000 times in 2.713300s 2.750693s 1.763099s 1.627523s
              stream     34.176      34.758        49.801       54.622 i/s -     100.000 times in 2.925991s 2.877065s 2.008003s 1.830779s

Comparison:
                              dom
         after(YJIT):        18.2 i/s
        before(YJIT):        17.2 i/s - 1.06x  slower
              before:        11.2 i/s - 1.62x  slower
               after:        10.6 i/s - 1.72x  slower

                              sax
         after(YJIT):        52.5 i/s
        before(YJIT):        48.4 i/s - 1.09x  slower
              before:        31.8 i/s - 1.65x  slower
               after:        30.7 i/s - 1.71x  slower

                             pull
         after(YJIT):        61.4 i/s
        before(YJIT):        56.7 i/s - 1.08x  slower
              before:        36.9 i/s - 1.67x  slower
               after:        36.4 i/s - 1.69x  slower

                           stream
         after(YJIT):        54.6 i/s
        before(YJIT):        49.8 i/s - 1.10x  slower
               after:        34.8 i/s - 1.57x  slower
              before:        34.2 i/s - 1.60x  slower

```

- YJIT=ON : 1.06x - 1.10x faster
- YJIT=OFF : 0.94x - 1.01x faster

---------

Co-authored-by: Sutou Kouhei <kou@clear-code.com>

===
This commit introduces wider use of StringScanner in
lib/rexml/parsers/baseparser.rb.

Later patches rely on this.

MISSING_ATTRIBUTE_QUOTES is not present in the upstream
lib/rexml/parsers/baseparser.rb at this point.
Although the constant does not seem to be used, this patch does not
remove it, so it is kept as-is.

https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5
---
 lib/rexml/parsers/baseparser.rb               | 328 ++++++++++--------
 lib/rexml/source.rb                           |  31 +-
 .../parse/test_document_type_declaration.rb   |  15 +
 3 files changed, 205 insertions(+), 169 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index c29783af5c..dafc1b1449 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -1,4 +1,4 @@
-# frozen_string_literal: false
+# frozen_string_literal: true
 
 require "strscan"
 
@@ -121,6 +121,19 @@ class BaseParser
       ######################################################################
       MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um
 
+      module Private
+        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+        TAG_PATTERN = /((?>#{QNAME_STR}))/um
+        CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
+        ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
+        NAME_PATTERN = /\s*#{NAME}/um
+        GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
+        PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
+        ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
+      end
+      private_constant :Private
+      include Private
+
       def initialize( source )
         self.stream = source
         @listeners = []
@@ -206,180 +219,172 @@ def pull_event
         #STDERR.puts @source.encoding
         #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
         if @document_status == nil
-          word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
-          word = word[1] unless word.nil?
-          #STDERR.puts "WORD = #{word.inspect}"
-          case word
-          when COMMENT_START
-            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
-          when XMLDECL_START
-            #STDERR.puts "XMLDECL"
-            results = @source.match( XMLDECL_PATTERN, true )[1]
-            version = VERSION.match( results )
-            version = version[1] unless version.nil?
-            encoding = ENCODING.match(results)
-            encoding = encoding[1] unless encoding.nil?
-            if need_source_encoding_update?(encoding)
-              @source.encoding = encoding
-            end
-            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
-              encoding = "UTF-16"
-            end
-            standalone = STANDALONE.match(results)
-            standalone = standalone[1] unless standalone.nil?
-            return [ :xmldecl, version, encoding, standalone ]
-          when INSTRUCTION_START
+          if @source.match("<?", true)
             return process_instruction
-          when DOCTYPE_START
-            base_error_message = "Malformed DOCTYPE"
-            @source.match(DOCTYPE_START, true)
-            @nsstack.unshift(curr_ns=Set.new)
-            name = parse_name(base_error_message)
-            if @source.match(/\A\s*\[/um, true)
-              id = [nil, nil, nil]
-              @document_status = :in_doctype
-            elsif @source.match(/\A\s*>/um, true)
-              id = [nil, nil, nil]
-              @document_status = :after_doctype
-            else
-              id = parse_id(base_error_message,
-                            accept_external_id: true,
-                            accept_public_id: false)
-              if id[0] == "SYSTEM"
-                # For backward compatibility
-                id[1], id[2] = id[2], nil
+          elsif @source.match("<!", true)
+            if @source.match("--", true)
+              return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
+            elsif @source.match("DOCTYPE", true)
+              base_error_message = "Malformed DOCTYPE"
+              unless @source.match(/\s+/um, true)
+                if @source.match(">")
+                  message = "#{base_error_message}: name is missing"
+                else
+                  message = "#{base_error_message}: invalid name"
+                end
+                @source.string = "<!DOCTYPE" + @source.buffer
+                raise REXML::ParseException.new(message, @source)
               end
-              if @source.match(/\A\s*\[/um, true)
-               @document_status = :in_doctype
-              elsif @source.match(/\A\s*>/um, true)
+              @nsstack.unshift(curr_ns=Set.new)
+              name = parse_name(base_error_message)
+              if @source.match(/\s*\[/um, true)
+                id = [nil, nil, nil]
+                @document_status = :in_doctype
+              elsif @source.match(/\s*>/um, true)
+                id = [nil, nil, nil]
                 @document_status = :after_doctype
               else
-                message = "#{base_error_message}: garbage after external ID"
-                raise REXML::ParseException.new(message, @source)
+                id = parse_id(base_error_message,
+                              accept_external_id: true,
+                              accept_public_id: false)
+                if id[0] == "SYSTEM"
+                  # For backward compatibility
+                  id[1], id[2] = id[2], nil
+                end
+                if @source.match(/\s*\[/um, true)
+                  @document_status = :in_doctype
+                elsif @source.match(/\s*>/um, true)
+                  @document_status = :after_doctype
+                else
+                  message = "#{base_error_message}: garbage after external ID"
+                  raise REXML::ParseException.new(message, @source)
+                end
               end
-            end
-            args = [:start_doctype, name, *id]
-            if @document_status == :after_doctype
-              @source.match(/\A\s*/um, true)
-              @stack << [ :end_doctype ]
-            end
-            return args
-          when /\A\s+/
-          else
-            @document_status = :after_doctype
-            if @source.encoding == "UTF-8"
-              @source.buffer_encoding = ::Encoding::UTF_8
+              args = [:start_doctype, name, *id]
+              if @document_status == :after_doctype
+                @source.match(/\s*/um, true)
+                @stack << [ :end_doctype ]
+              end
+              return args
+            else
+              message = "Invalid XML"
+              raise REXML::ParseException.new(message, @source)
             end
           end
         end
         if @document_status == :in_doctype
-          md = @source.match(/\A\s*(.*?>)/um)
-          case md[1]
-          when SYSTEMENTITY
-            match = @source.match( SYSTEMENTITY, true )[1]
-            return [ :externalentity, match ]
-
-          when ELEMENTDECL_START
-            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
-
-          when ENTITY_START
-            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
-            ref = false
-            if match[1] == '%'
-              ref = true
-              match.delete_at 1
-            end
-            # Now we have to sort out what kind of entity reference this is
-            if match[2] == 'SYSTEM'
-              # External reference
-              match[3] = match[3][1..-2] # PUBID
-              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
-              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
-            elsif match[2] == 'PUBLIC'
-              # External reference
-              match[3] = match[3][1..-2] # PUBID
-              match[4] = match[4][1..-2] # HREF
-              match.delete_at(5) if match.size > 5 # Chop out NDATA decl
-              # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
-            else
-              match[2] = match[2][1..-2]
-              match.pop if match.size == 4
-              # match is [ :entity, name, value ]
-            end
-            match << '%' if ref
-            return match
-          when ATTLISTDECL_START
-            md = @source.match( ATTLISTDECL_PATTERN, true )
-            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
-            element = md[1]
-            contents = md[0]
-
-            pairs = {}
-            values = md[0].scan( ATTDEF_RE )
-            values.each do |attdef|
-              unless attdef[3] == "#IMPLIED"
-                attdef.compact!
-                val = attdef[3]
-                val = attdef[4] if val == "#FIXED "
-                pairs[attdef[0]] = val
-                if attdef[0] =~ /^xmlns:(.*)/
-                  @nsstack[0] << $1
-                end
+          @source.match(/\s*/um, true) # skip spaces
+          if @source.match("<!", true)
+            if @source.match("ELEMENT", true)
+              md = @source.match(/(.*?)>/um, true)
+              raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
+              return [ :elementdecl, "<!ELEMENT" + md[1] ]
+            elsif @source.match("ENTITY", true)
+              match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
+              ref = false
+              if match[1] == '%'
+                ref = true
+                match.delete_at 1
               end
-            end
-            return [ :attlistdecl, element, pairs, contents ]
-          when NOTATIONDECL_START
-            base_error_message = "Malformed notation declaration"
-            unless @source.match(/\A\s*<!NOTATION\s+/um, true)
-              if @source.match(/\A\s*<!NOTATION\s*>/um)
-                message = "#{base_error_message}: name is missing"
+              # Now we have to sort out what kind of entity reference this is
+              if match[2] == 'SYSTEM'
+                # External reference
+                match[3] = match[3][1..-2] # PUBID
+                match.delete_at(4) if match.size > 4 # Chop out NDATA decl
+                # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
+              elsif match[2] == 'PUBLIC'
+                # External reference
+                match[3] = match[3][1..-2] # PUBID
+                match[4] = match[4][1..-2] # HREF
+                match.delete_at(5) if match.size > 5 # Chop out NDATA decl
+                # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
               else
-                message = "#{base_error_message}: invalid declaration name"
+                match[2] = match[2][1..-2]
+                match.pop if match.size == 4
+                # match is [ :entity, name, value ]
               end
-              raise REXML::ParseException.new(message, @source)
-            end
-            name = parse_name(base_error_message)
-            id = parse_id(base_error_message,
-                          accept_external_id: true,
-                          accept_public_id: true)
-            unless @source.match(/\A\s*>/um, true)
-              message = "#{base_error_message}: garbage before end >"
-              raise REXML::ParseException.new(message, @source)
+              match << '%' if ref
+              return match
+            elsif @source.match("ATTLIST", true)
+              md = @source.match(ATTLISTDECL_END, true)
+              raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
+              element = md[1]
+              contents = md[0]
+
+              pairs = {}
+              values = md[0].scan( ATTDEF_RE )
+              values.each do |attdef|
+                unless attdef[3] == "#IMPLIED"
+                  attdef.compact!
+                  val = attdef[3]
+                  val = attdef[4] if val == "#FIXED "
+                  pairs[attdef[0]] = val
+                  if attdef[0] =~ /^xmlns:(.*)/
+                    @nsstack[0] << $1
+                  end
+                end
+              end
+              return [ :attlistdecl, element, pairs, contents ]
+            elsif @source.match("NOTATION", true)
+              base_error_message = "Malformed notation declaration"
+              unless @source.match(/\s+/um, true)
+                if @source.match(">")
+                  message = "#{base_error_message}: name is missing"
+                else
+                  message = "#{base_error_message}: invalid name"
+                end
+                @source.string = " <!NOTATION" + @source.buffer
+                raise REXML::ParseException.new(message, @source)
+              end
+              name = parse_name(base_error_message)
+              id = parse_id(base_error_message,
+                            accept_external_id: true,
+                            accept_public_id: true)
+              unless @source.match(/\s*>/um, true)
+                message = "#{base_error_message}: garbage before end >"
+                raise REXML::ParseException.new(message, @source)
+              end
+              return [:notationdecl, name, *id]
+            elsif md = @source.match(/--(.*?)-->/um, true)
+              case md[1]
+              when /--/, /-\z/
+                raise REXML::ParseException.new("Malformed comment", @source)
+              end
+              return [ :comment, md[1] ] if md
             end
-            return [:notationdecl, name, *id]
-          when DOCTYPE_END
+          elsif match = @source.match(/(%.*?;)\s*/um, true)
+            return [ :externalentity, match[1] ]
+          elsif @source.match(/\]\s*>/um, true)
             @document_status = :after_doctype
-            @source.match( DOCTYPE_END, true )
             return [ :end_doctype ]
           end
         end
         if @document_status == :after_doctype
-          @source.match(/\A\s*/um, true)
+          @source.match(/\s*/um, true)
         end
         begin
-          @source.read if @source.buffer.size<2
-          if @source.buffer[0] == ?<
-            if @source.buffer[1] == ?/
+          if @source.match("<", true)
+            if @source.match("/", true)
               @nsstack.shift
               last_tag = @tags.pop
-              #md = @source.match_to_consume( '>', CLOSE_MATCH)
-              md = @source.match( CLOSE_MATCH, true )
+              md = @source.match(CLOSE_PATTERN, true)
               if md and !last_tag
                 message = "Unexpected top-level end tag (got '#{md[1]}')"
                 raise REXML::ParseException.new(message, @source)
               end
               if md.nil? or last_tag != md[1]
                 message = "Missing end tag for '#{last_tag}'"
-                message << " (got '#{md[1]}')" if md
+                message += " (got '#{md[1]}')" if md
+                @source.string = "</" + @source.buffer if md.nil?
                 raise REXML::ParseException.new(message, @source)
               end
               return [ :end_element, last_tag ]
-            elsif @source.buffer[1] == ?!
-              md = @source.match(/\A(\s*[^>]*>)/um)
+            elsif @source.match("!", true)
+              md = @source.match(/([^>]*>)/um)
               #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
               raise REXML::ParseException.new("Malformed node", @source) unless md
-              if md[0][2] == ?-
-                md = @source.match( COMMENT_PATTERN, true )
+              if md[0][0] == ?-
+                md = @source.match(/--(.*?)-->/um, true)
 
                 case md[1]
                 when /--/, /-\z/
@@ -388,19 +393,18 @@ def pull_event
 
                 return [ :comment, md[1] ] if md
               else
-                md = @source.match( CDATA_PATTERN, true )
+                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
                 return [ :cdata, md[1] ] if md
               end
               raise REXML::ParseException.new( "Declarations can only occur "+
                 "in the doctype declaration.", @source)
-            elsif @source.buffer[1] == ??
+            elsif @source.match("?", true)
               return process_instruction
             else
               # Get the next tag
-              md = @source.match(TAG_MATCH, true)
+              md = @source.match(TAG_PATTERN, true)
               unless md
-                # Check for missing attribute quotes
-                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
+                @source.string = "<" + @source.buffer
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
               tag = md[1]
@@ -425,7 +429,7 @@ def pull_event
               return [ :start_element, tag, attributes ]
             end
           else
-            md = @source.match( TEXT_PATTERN, true )
+            md = @source.match(/([^<]*)/um, true)
             text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
@@ -472,8 +476,7 @@ def normalize( input, entities=nil, entity_filter=nil )
 
       # Unescapes all possible entities
       def unnormalize( string, entities=nil, filter=nil )
-        rv = string.clone
-        rv.gsub!( /\r\n?/, "\n" )
+        rv = string.gsub( /\r\n?/, "\n" )
         matches = rv.scan( REFERENCE_RE )
         return rv if matches.size == 0
         rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -508,9 +511,9 @@ def need_source_encoding_update?(xml_declaration_encoding)
       end
 
       def parse_name(base_error_message)
-        md = @source.match(/\A\s*#{NAME}/um, true)
+        md = @source.match(NAME_PATTERN, true)
         unless md
-          if @source.match(/\A\s*\S/um)
+          if @source.match(/\s*\S/um)
             message = "#{base_error_message}: invalid name"
           else
             message = "#{base_error_message}: name is missing"
@@ -671,12 +674,29 @@ def parse_attributes(prefixes, curr_ns)
       end
 
       def process_instruction
-        match_data = @source.match(INSTRUCTION_PATTERN, true)
+        match_data = @source.match(INSTRUCTION_END, true)
         unless match_data
           message = "Invalid processing instruction node"
+          @source.string = "<?" + @source.buffer
           raise REXML::ParseException.new(message, @source)
         end
-        [:processing_instruction, match_data[1], match_data[4]]
+        if @document_status.nil? and match_data[1] == "xml"
+          content = match_data[2]
+          version = VERSION.match(content)
+          version = version[1] unless version.nil?
+          encoding = ENCODING.match(content)
+          encoding = encoding[1] unless encoding.nil?
+          if need_source_encoding_update?(encoding)
+            @source.encoding = encoding
+          end
+          if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+            encoding = "UTF-16"
+          end
+          standalone = STANDALONE.match(content)
+          standalone = standalone[1] unless standalone.nil?
+          return [ :xmldecl, version, encoding, standalone ]
+        end
+        [:processing_instruction, match_data[1], match_data[2]]
       end
     end
   end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 4624777fa0..d2a6ad872e 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -76,6 +76,10 @@ def match(pattern, cons=false)
       end
     end
 
+    def string=(string)
+      @scanner.string = string
+    end
+
     # @return true if the Source is exhausted
     def empty?
       @scanner.eos?
@@ -150,28 +154,25 @@ def initialize(arg, block_size=500, encoding=nil)
     def read
       begin
         @scanner << readline
+        true
       rescue Exception, NameError
         @source = nil
+        false
       end
     end
 
     def match( pattern, cons=false )
-      if cons
-        md = @scanner.scan(pattern)
-      else
-        md = @scanner.check(pattern)
-      end
-      while md.nil? and @source
-        begin
-          @scanner << readline
-          if cons
-            md = @scanner.scan(pattern)
-          else
-            md = @scanner.check(pattern)
-          end
-        rescue
-          @source = nil
+      read if @scanner.eos? && @source
+      while true
+        if cons
+          md = @scanner.scan(pattern)
+        else
+          md = @scanner.check(pattern)
         end
+        break if md
+        return nil if pattern.is_a?(String) && pattern.bytesize <= @scanner.rest_size
+        return nil if @source.nil?
+        return nil unless read
       end
 
       md.nil? ? nil : @scanner
diff --git a/test/rexml/parse/test_document_type_declaration.rb b/test/rexml/parse/test_document_type_declaration.rb
index 55713909e7..8faa0b78be 100644
--- a/test/rexml/parse/test_document_type_declaration.rb
+++ b/test/rexml/parse/test_document_type_declaration.rb
@@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start
 + r SYSTEM "urn:x-rexml:test" [ ]>  <r/> 
         DETAIL
       end
+
+      def test_no_name
+        exception = assert_raise(REXML::ParseException) do
+          parse(<<-DOCTYPE)
+<!DOCTYPE>
+          DOCTYPE
+        end
+        assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: name is missing
+Line: 3
+Position: 17
+Last 80 unconsumed characters:
+<!DOCTYPE>  <r/> 
+        DETAIL
+      end
     end
 
     class TestExternalID < self

From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Mon, 18 Mar 2024 23:30:47 +0900
Subject: [PATCH 5/7]  Optimize the parse_attributes method to use
 `Source#match` to parse XML.  (#119)

## Why?

Improve maintainability by consolidating processing into `Source#match`.

## Benchmark
```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     10.891      10.622        16.356       17.403 i/s -     100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s
                 sax     30.335      29.845        49.749       54.877 i/s -     100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s
                pull     35.514      34.801        61.123       66.908 i/s -     100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s
              stream     35.141      34.475        52.110       56.836 i/s -     100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s

Comparison:
                              dom
         after(YJIT):        17.4 i/s
        before(YJIT):        16.4 i/s - 1.06x  slower
              before:        10.9 i/s - 1.60x  slower
               after:        10.6 i/s - 1.64x  slower

                              sax
         after(YJIT):        54.9 i/s
        before(YJIT):        49.7 i/s - 1.10x  slower
              before:        30.3 i/s - 1.81x  slower
               after:        29.8 i/s - 1.84x  slower

                             pull
         after(YJIT):        66.9 i/s
        before(YJIT):        61.1 i/s - 1.09x  slower
              before:        35.5 i/s - 1.88x  slower
               after:        34.8 i/s - 1.92x  slower

                           stream
         after(YJIT):        56.8 i/s
        before(YJIT):        52.1 i/s - 1.09x  slower
              before:        35.1 i/s - 1.62x  slower
               after:        34.5 i/s - 1.65x  slower

```

- YJIT=ON : 1.06x - 1.10x faster
- YJIT=OFF : 0.97x - 0.98x faster
===
"test_attribute_namespace_conflict" in test/rexml/test_core.rb
adjustment not applied as the commit that added the test is not present.

Wider use of Source#match to parse XML.

Changes the way BaseParser#parse_attributes
grabs fields that make up an XML attribute.

This is relied upon by upstream 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
that actually fixes CVE-2024-35176.

https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d
---
 lib/rexml/parsers/baseparser.rb  | 113 ++++++++++++-------------------
 test/rexml/parse/test_element.rb |   2 +-
 test/rexml/test_core.rb          |  17 ++++-
 3 files changed, 61 insertions(+), 71 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index dafc1b1449..45eabe8cc5 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -123,7 +123,7 @@ class BaseParser
 
       module Private
         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
-        TAG_PATTERN = /((?>#{QNAME_STR}))/um
+        TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
         NAME_PATTERN = /\s*#{NAME}/um
@@ -592,85 +592,60 @@ def parse_id_invalid_details(accept_external_id:,
       def parse_attributes(prefixes, curr_ns)
         attributes = {}
         closed = false
-        match_data = @source.match(/^(.*?)(\/)?>/um, true)
-        if match_data.nil?
-          message = "Start tag isn't ended"
-          raise REXML::ParseException.new(message, @source)
-        end
-
-        raw_attributes = match_data[1]
-        closed = !match_data[2].nil?
-        return attributes, closed if raw_attributes.nil?
-        return attributes, closed if raw_attributes.empty?
-
-        scanner = StringScanner.new(raw_attributes)
-        until scanner.eos?
-          if scanner.scan(/\s+/)
-            break if scanner.eos?
-          end
-
-          pos = scanner.pos
-          loop do
-            break if scanner.scan(ATTRIBUTE_PATTERN)
-            unless scanner.scan(QNAME)
-              message = "Invalid attribute name: <#{scanner.rest}>"
-              raise REXML::ParseException.new(message, @source)
-            end
-            name = scanner[0]
-            unless scanner.scan(/\s*=\s*/um)
+        while true
+          if @source.match(">", true)
+            return attributes, closed
+          elsif @source.match("/>", true)
+            closed = true
+            return attributes, closed
+          elsif match = @source.match(QNAME, true)
+            name = match[1]
+            prefix = match[2]
+            local_part = match[3]
+
+            unless @source.match(/\s*=\s*/um, true)
               message = "Missing attribute equal: <#{name}>"
               raise REXML::ParseException.new(message, @source)
             end
-            quote = scanner.scan(/['"]/)
-            unless quote
-              message = "Missing attribute value start quote: <#{name}>"
-              raise REXML::ParseException.new(message, @source)
-            end
-            unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
-              match_data = @source.match(/^(.*?)(\/)?>/um, true)
-              if match_data
-                scanner << "/" if closed
-                scanner << ">"
-                scanner << match_data[1]
-                scanner.pos = pos
-                closed = !match_data[2].nil?
-                next
+            unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
+              if match = @source.match(/(['"])/, true)
+                message =
+                  "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
+                raise REXML::ParseException.new(message, @source)
+              else
+                message = "Missing attribute value start quote: <#{name}>"
+                raise REXML::ParseException.new(message, @source)
               end
-              message =
-                "Missing attribute value end quote: <#{name}>: <#{quote}>"
-              raise REXML::ParseException.new(message, @source)
             end
-          end
-          name = scanner[1]
-          prefix = scanner[2]
-          local_part = scanner[3]
-          # quote = scanner[4]
-          value = scanner[5]
-          if prefix == "xmlns"
-            if local_part == "xml"
-              if value != "http://www.w3.org/XML/1998/namespace"
-                msg = "The 'xml' prefix must not be bound to any other namespace "+
+            value = match[2]
+            if prefix == "xmlns"
+              if local_part == "xml"
+                if value != "http://www.w3.org/XML/1998/namespace"
+                  msg = "The 'xml' prefix must not be bound to any other namespace "+
+                    "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+                  raise REXML::ParseException.new( msg, @source, self )
+                end
+              elsif local_part == "xmlns"
+                msg = "The 'xmlns' prefix must not be declared "+
                   "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
-                raise REXML::ParseException.new( msg, @source, self )
+                raise REXML::ParseException.new( msg, @source, self)
               end
-            elsif local_part == "xmlns"
-              msg = "The 'xmlns' prefix must not be declared "+
-                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
-              raise REXML::ParseException.new( msg, @source, self)
+              curr_ns << local_part
+            elsif prefix
+              prefixes << prefix unless prefix == "xml"
             end
-            curr_ns << local_part
-          elsif prefix
-            prefixes << prefix unless prefix == "xml"
-          end
 
-          if attributes.has_key?(name)
-            msg = "Duplicate attribute #{name.inspect}"
-            raise REXML::ParseException.new(msg, @source, self)
-          end
+            if attributes.has_key?(name)
+              msg = "Duplicate attribute #{name.inspect}"
+              raise REXML::ParseException.new(msg, @source, self)
+            end
 
-          attributes[name] = value
+            attributes[name] = value
+          else
+            message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
+            raise REXML::ParseException.new(message, @source)
+          end
         end
-        return attributes, closed
       end
 
       def process_instruction
diff --git a/test/rexml/parse/test_element.rb b/test/rexml/parse/test_element.rb
index e8dce4b997..987214f3bb 100644
--- a/test/rexml/parse/test_element.rb
+++ b/test/rexml/parse/test_element.rb
@@ -43,7 +43,7 @@ def test_empty_namespace_attribute_name
 Line: 1
 Position: 13
 Last 80 unconsumed characters:
-
+:a=""></x>
         DETAIL
       end
 
diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
index d4ece491b9..46de90ccd7 100644
--- a/test/rexml/test_core.rb
+++ b/test/rexml/test_core.rb
@@ -1277,11 +1277,26 @@ def test_ticket_21
       exception = assert_raise(ParseException) do
         Document.new(src)
       end
-      assert_equal(<<-DETAIL, exception.to_s)
+      assert_equal(<<-DETAIL.chomp, exception.to_s)
 Missing attribute value start quote: <bar>
 Line: 1
 Position: 16
 Last 80 unconsumed characters:
+value/>
+      DETAIL
+    end
+
+    def test_parse_exception_on_missing_attribute_end_quote
+      src = '<foo bar="value/>'
+      exception = assert_raise(ParseException) do
+        Document.new(src)
+      end
+      assert_equal(<<-DETAIL.chomp, exception.to_s)
+Missing attribute value end quote: <bar>: <">
+Line: 1
+Position: 17
+Last 80 unconsumed characters:
+value/>
       DETAIL
     end
 

From 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb Mon Sep 17 00:00:00 2001
From: Nobuyoshi Nakada <nobu@ruby-lang.org>
Date: Thu, 16 May 2024 11:26:51 +0900
Subject: [PATCH 6/7] Read quoted attributes in chunks (#126)

===
test/rexml/test_document.rb:
* test_gt_linear_performance
Test not included, Ruby 2.5 does not have assert_linear_performance
method available in the test suite. It was introduced in Ruby 2.7.
* Do not require 'core_assertions'
Current requires are fine for running the tests in mock build.

The fix for CVE-2024-35176. By reading the field in chunks instead of 1
by 1 it improves performance for large strings composed of angle
brackets.

https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
---
 lib/rexml/parsers/baseparser.rb | 20 ++++++++++----------
 lib/rexml/source.rb             | 29 ++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 45eabe8cc5..304835e407 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -607,17 +607,17 @@ def parse_attributes(prefixes, curr_ns)
               message = "Missing attribute equal: <#{name}>"
               raise REXML::ParseException.new(message, @source)
             end
-            unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
-              if match = @source.match(/(['"])/, true)
-                message =
-                  "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
-                raise REXML::ParseException.new(message, @source)
-              else
-                message = "Missing attribute value start quote: <#{name}>"
-                raise REXML::ParseException.new(message, @source)
-              end
+            unless match = @source.match(/(['"])/, true)
+              message = "Missing attribute value start quote: <#{name}>"
+              raise REXML::ParseException.new(message, @source)
+            end
+            quote = match[1]
+            value = @source.read_until(quote)
+            unless value.chomp!(quote)
+              message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
+              raise REXML::ParseException.new(message, @source)
             end
-            value = match[2]
+            @source.match(/\s*/um, true)
             if prefix == "xmlns"
               if local_part == "xml"
                 if value != "http://www.w3.org/XML/1998/namespace"
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index d2a6ad872e..2a3bd36695 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -65,7 +65,11 @@ def encoding=(enc)
       encoding_updated
     end
 
-    def read
+    def read(term = nil)
+    end
+
+    def read_until(term)
+      @scanner.scan_until(Regexp.union(term)) or @scanner.rest
     end
 
     def match(pattern, cons=false)
@@ -151,9 +155,9 @@ def initialize(arg, block_size=500, encoding=nil)
       end
     end
 
-    def read
+    def read(term = nil)
       begin
-        @scanner << readline
+        @scanner << readline(term)
         true
       rescue Exception, NameError
         @source = nil
@@ -161,6 +165,21 @@ def read
       end
     end
 
+    def read_until(term)
+      pattern = Regexp.union(term)
+      data = []
+      begin
+        until str = @scanner.scan_until(pattern)
+          @scanner << readline(term)
+        end
+      rescue EOFError
+        @scanner.rest
+      else
+        read if @scanner.eos? and !@source.eof?
+        str
+      end
+    end
+
     def match( pattern, cons=false )
       read if @scanner.eos? && @source
       while true
@@ -204,8 +223,8 @@ def current_line
     end
 
     private
-    def readline
-      str = @source.readline(@line_break)
+    def readline(term = nil)
+      str = @source.readline(term || @line_break)
       if @pending_buffer
         if str.nil?
           str = @pending_buffer

From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei <kou@clear-code.com>
Date: Mon, 20 May 2024 12:17:27 +0900
Subject: [PATCH 7/7] Add support for old strscan

Fix GH-132

If we support old strscan, users can also use strscan installed as a
default gem.

Reported by Adam. Thanks!!!

===
The strscan shipped with Ruby 2.5 lacks the StringScanner#captures method.
ruby/rexml#105 introduced the dependency on this method.

https://bugs.ruby-lang.org/issues/20516#note-11
https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420
---
 lib/rexml/parsers/baseparser.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 304835e407..006d8acb9b 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -9,6 +9,17 @@
 
 module REXML
   module Parsers
+    if StringScanner::Version < "3.0.8"
+      module StringScannerCaptures
+        refine StringScanner do
+          def captures
+            values_at(*(1...size))
+          end
+        end
+      end
+      using StringScannerCaptures
+    end
+
     # = Using the Pull Parser
     # <em>This API is experimental, and subject to change.</em>
     #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )