From 11ccefabe3cadf5e55c45ee897647c618522bf61 Mon Sep 17 00:00:00 2001
From: Jarek Prokop <jprokop@redhat.com>
Date: Fri, 7 Jun 2024 15:06:51 +0200
Subject: [PATCH] Fix REXML DoS parsing an XML with many `<`s in an attribute
 value (CVE-2024-35176).

Resolves: RHEL-37877
---
 ruby.spec                                     |   20 +
 ....2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch | 1662 +++++++++++++++++
 ...can-1.0.2-Accept-String-as-a-pattern.patch |  264 +++
 3 files changed, 1946 insertions(+)
 create mode 100644 rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch
 create mode 100644 rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch

diff --git a/ruby.spec b/ruby.spec
index e39a49d..4635f3d 100644
--- a/ruby.spec
+++ b/ruby.spec
@@ -251,6 +251,21 @@ Patch46: ruby-3.0.7-Fix-CVE-2024-27281-RCE-vulnerability-with-rdoc_options.patch
 # Backported from:
 # https://github.com/ruby/ruby/commit/6c6dca749d3f732b7be04bae20095a040c50fdb8
 Patch47: ruby-3.0.7-Fix-CVE-2024-27282-Memory-address-read-with-Regex.patch
+# Fix for REXML CVE-2024-35176 depends on being able to pass a string to the
+# scan method in addition to a regex.
+# https://github.com/ruby/strscan/pull/4
+Patch48: rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch
+# CVE-2024-35176 REXML: DoS parsing an XML with many `<`s in an attribute value.
+# The actual fix for the CVE is https://github.com/ruby/rexml/pull/126 ,
+# but that PR is depending on the content of a few previous PRs and commits.
+# https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc
+# https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c
+# https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320
+# https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5
+# https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d
+# https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
+# https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420
+Patch49: rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch
 
 
 Requires: %{name}-libs%{?_isa} = %{version}-%{release}
@@ -669,6 +684,8 @@ sed -i 's/"evaluation\/incorrect_words.yaml"\.freeze, //' \
 %patch45 -p1
 %patch46 -p1
 %patch47 -p1
+%patch48 -p1
+%patch49 -p1
 
 # Provide an example of usage of the tapset:
 cp -a %{SOURCE3} .
@@ -1234,6 +1251,9 @@ OPENSSL_SYSTEM_CIPHERS_OVERRIDE=xyz_nonexistent_file OPENSSL_CONF='' \
 - Fix Arbitrary memory address read vulnerability with Regex search.
   (CVE-2024-27282)
   Resolves: RHEL-33867
+- Fix REXML DoS parsing an XML with many `<`s in an attribute value.
+  (CVE-2024-35176)
+  Resolves: RHEL-37877
 
 * Mon Jun 12 2023 Jarek Prokop <jprokop@redhat.com> - 2.5.9-111
 - Fix HTTP response splitting in CGI.
diff --git a/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch b/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch
new file mode 100644
index 0000000..bf6d96f
--- /dev/null
+++ b/rubygem-rexml-3.2.9-Fix-CVE-2024-35176-DoS-in-REXML.patch
@@ -0,0 +1,1662 @@
+From 694239f0855668c986feba6f1b395ecd94a1f0bc Mon Sep 17 00:00:00 2001
+From: Kouhei Sutou <kou@clear-code.com>
+Date: Fri, 28 Sep 2018 16:29:17 +0900
+Subject: [PATCH 1/7] Make PI parsing more robust
+
+PI target must be "NAME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))". So
+'<?pos="3"?>' is invalid because "=" isn't included in NAME.
+
+Without this change, 'pos="3"' was accepted as PI target.
+
+===
+Commit implements #process_instruction method for baseparser
+that later patches rely upon.
+
+https://github.com/ruby/rexml/commit/694239f0855668c986feba6f1b395ecd94a1f0bc
+---
+ lib/rexml/parsers/baseparser.rb               | 18 ++++++++-----
+ test/rexml/data/t75.xml                       |  2 +-
+ .../parse/test_processing_instruction.rb      | 25 +++++++++++++++++++
+ 3 files changed, 38 insertions(+), 7 deletions(-)
+ create mode 100644 test/rexml/parse/test_processing_instruction.rb
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index e7ef695912..84a0bcdcec 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -61,7 +61,7 @@ class BaseParser
+       XMLDECL_START = /\A<\?xml\s/u;
+       XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
+       INSTRUCTION_START = /\A<\?/u
+-      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
++      INSTRUCTION_PATTERN = /<\?#{QNAME}(\s+.*?)?\?>/um
+       TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
+       CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
+ 
+@@ -229,7 +229,7 @@ def pull_event
+             standalone = standalone[1] unless standalone.nil?
+             return [ :xmldecl, version, encoding, standalone ]
+           when INSTRUCTION_START
+-            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
++            return process_instruction
+           when DOCTYPE_START
+             base_error_message = "Malformed DOCTYPE"
+             @source.match(DOCTYPE_START, true)
+@@ -395,10 +395,7 @@ def pull_event
+               raise REXML::ParseException.new( "Declarations can only occur "+
+                 "in the doctype declaration.", @source)
+             elsif @source.buffer[1] == ??
+-              md = @source.match( INSTRUCTION_PATTERN, true )
+-              return [ :processing_instruction, md[1], md[2] ] if md
+-              raise REXML::ParseException.new( "Bad instruction declaration",
+-                @source)
++              return process_instruction
+             else
+               # Get the next tag
+               md = @source.match(TAG_MATCH, true)
+@@ -675,6 +672,15 @@ def parse_attributes(prefixes, curr_ns)
+         end
+         return attributes, closed
+       end
++
++      def process_instruction
++        match_data = @source.match(INSTRUCTION_PATTERN, true)
++        unless match_data
++          message = "Invalid processing instruction node"
++          raise REXML::ParseException.new(message, @source)
++        end
++        [:processing_instruction, match_data[1], match_data[4]]
++      end
+     end
+   end
+ end
+diff --git a/test/rexml/data/t75.xml b/test/rexml/data/t75.xml
+index 0911fb1b1a..eb3cccee4b 100644
+--- a/test/rexml/data/t75.xml
++++ b/test/rexml/data/t75.xml
+@@ -1,4 +1,4 @@
+-<?xml version="1.0" encoding="ISO-8859-1"?><?pos="3"?>
++<?xml version="1.0" encoding="ISO-8859-1"?>
+ <!-- generated by hnb 1.9.17 (http://hnb.sourceforge.net) -->
+ 
+ <!DOCTYPE tree[
+diff --git a/test/rexml/parse/test_processing_instruction.rb b/test/rexml/parse/test_processing_instruction.rb
+new file mode 100644
+index 0000000000..a23513fc6e
+--- /dev/null
++++ b/test/rexml/parse/test_processing_instruction.rb
+@@ -0,0 +1,25 @@
++require "test/unit"
++require "rexml/document"
++
++module REXMLTests
++  class TestParseProcessinInstruction < Test::Unit::TestCase
++    def parse(xml)
++      REXML::Document.new(xml)
++    end
++
++    class TestInvalid < self
++      def test_no_name
++        exception = assert_raise(REXML::ParseException) do
++          parse("<??>")
++        end
++        assert_equal(<<-DETAIL.chomp, exception.to_s)
++Invalid processing instruction node
++Line: 1
++Position: 4
++Last 80 unconsumed characters:
++<??>
++        DETAIL
++      end
++    end
++  end
++end
+
+From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001
+From: NAITOH Jun <naitoh@gmail.com>
+Date: Wed, 17 Jan 2024 15:32:57 +0900
+Subject: [PATCH 2/7] Use string scanner with baseparser (#105)
+
+Using StringScanner reduces the string copying process and speeds up the
+process.
+
+And I removed unnecessary methods.
+
+https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105
+
+```
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
+Calculating -------------------------------------
+                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
+                 dom       4.868       5.077        8.137         8.303 i/s -     100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
+                 sax      13.597      13.953       19.206        20.948 i/s -     100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
+                pull      15.641      16.918       22.266        25.378 i/s -     100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
+              stream      14.339      15.844       19.810        22.206 i/s -     100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s
+
+Comparison:
+                              dom
+        master(YJIT):         8.3 i/s
+         3.2.6(YJIT):         8.1 i/s - 1.02x  slower
+              master:         5.1 i/s - 1.64x  slower
+         rexml 3.2.6:         4.9 i/s - 1.71x  slower
+
+                              sax
+        master(YJIT):        20.9 i/s
+         3.2.6(YJIT):        19.2 i/s - 1.09x  slower
+              master:        14.0 i/s - 1.50x  slower
+         rexml 3.2.6:        13.6 i/s - 1.54x  slower
+
+                             pull
+        master(YJIT):        25.4 i/s
+         3.2.6(YJIT):        22.3 i/s - 1.14x  slower
+              master:        16.9 i/s - 1.50x  slower
+         rexml 3.2.6:        15.6 i/s - 1.62x  slower
+
+                           stream
+        master(YJIT):        22.2 i/s
+         3.2.6(YJIT):        19.8 i/s - 1.12x  slower
+              master:        15.8 i/s - 1.40x  slower
+         rexml 3.2.6:        14.3 i/s - 1.55x  slower
+```
+
+- YJIT=ON : 1.02x - 1.14x faster
+- YJIT=OFF : 1.02x - 1.10x faster
+
+---------
+
+Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
+
+===
+The PR focuses on enhancements in lib/rexml/source.rb.
+StringScanner is used instead of just a string for scanning and
+matching.
+This behavior is relied upon in later patches.
+
+Preparation for fix for CVE-2024-35176.
+
+https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c
+---
+ lib/rexml/parsers/baseparser.rb             |  21 ++-
+ lib/rexml/source.rb                         | 149 ++++++--------------
+ test/rexml/parse/test_entity_declaration.rb |  36 +++++
+ test/rexml/test_core.rb                     |   2 +-
+ 4 files changed, 93 insertions(+), 115 deletions(-)
+ create mode 100644 test/rexml/parse/test_entity_declaration.rb
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index 84a0bcdcec..c29783af5c 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -98,7 +98,7 @@ class BaseParser
+       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
+       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
+       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
+-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
++      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
+ 
+       NOTATIONDECL_START = /\A\s*<!NOTATION/um
+       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
+@@ -268,7 +268,7 @@ def pull_event
+           else
+             @document_status = :after_doctype
+             if @source.encoding == "UTF-8"
+-              @source.buffer.force_encoding(::Encoding::UTF_8)
++              @source.buffer_encoding = ::Encoding::UTF_8
+             end
+           end
+         end
+@@ -283,8 +283,7 @@ def pull_event
+             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
+ 
+           when ENTITY_START
+-            match = @source.match( ENTITYDECL, true ).to_a.compact
+-            match[0] = :entitydecl
++            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
+             ref = false
+             if match[1] == '%'
+               ref = true
+@@ -404,6 +403,7 @@ def pull_event
+                 raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
+                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
+               end
++              tag = md[1]
+               @document_status = :in_element
+               prefixes = Set.new
+               prefixes << md[2] if md[2]
+@@ -417,23 +417,20 @@ def pull_event
+               end
+ 
+               if closed
+-                @closed = md[1]
++                @closed = tag
+                 @nsstack.shift
+               else
+-                @tags.push( md[1] )
++                @tags.push( tag )
+               end
+-              return [ :start_element, md[1], attributes ]
++              return [ :start_element, tag, attributes ]
+             end
+           else
+             md = @source.match( TEXT_PATTERN, true )
++            text = md[1]
+             if md[0].length == 0
+               @source.match( /(\s+)/, true )
+             end
+-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
+-            #return [ :text, "" ] if md[0].length == 0
+-            # unnormalized = Text::unnormalize( md[1], self )
+-            # return PullEvent.new( :text, md[1], unnormalized )
+-            return [ :text, md[1] ]
++            return [ :text, text ]
+           end
+         rescue REXML::UndefinedNamespaceException
+           raise
+diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
+index af65cf4751..9883e79210 100644
+--- a/lib/rexml/source.rb
++++ b/lib/rexml/source.rb
+@@ -30,8 +30,6 @@ def SourceFactory::create_from(arg)
+   # objects and provides consumption of text
+   class Source
+     include Encoding
+-    # The current buffer (what we're going to read next)
+-    attr_reader :buffer
+     # The line number of the last consumed text
+     attr_reader :line
+     attr_reader :encoding
+@@ -41,7 +39,8 @@ class Source
+     # @param encoding if non-null, sets the encoding of the source to this
+     # value, overriding all encoding detection
+     def initialize(arg, encoding=nil)
+-      @orig = @buffer = arg
++      @orig = arg
++      @scanner = StringScanner.new(@orig)
+       if encoding
+         self.encoding = encoding
+       else
+@@ -50,6 +49,14 @@ def initialize(arg, encoding=nil)
+       @line = 0
+     end
+ 
++    # The current buffer (what we're going to read next)
++    def buffer
++      @scanner.rest
++    end
++
++    def buffer_encoding=(encoding)
++      @scanner.string.force_encoding(encoding)
++    end
+ 
+     # Inherited from Encoding
+     # Overridden to support optimized en/decoding
+@@ -58,98 +65,57 @@ def encoding=(enc)
+       encoding_updated
+     end
+ 
+-    # Scans the source for a given pattern.  Note, that this is not your
+-    # usual scan() method.  For one thing, the pattern argument has some
+-    # requirements; for another, the source can be consumed.  You can easily
+-    # confuse this method.  Originally, the patterns were easier
+-    # to construct and this method more robust, because this method
+-    # generated search regexps on the fly; however, this was
+-    # computationally expensive and slowed down the entire REXML package
+-    # considerably, since this is by far the most commonly called method.
+-    # @param pattern must be a Regexp, and must be in the form of
+-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
+-    # will be returned; the second group is used if the consume flag is
+-    # set.
+-    # @param consume if true, the pattern returned will be consumed, leaving
+-    # everything after it in the Source.
+-    # @return the pattern, if found, or nil if the Source is empty or the
+-    # pattern is not found.
+-    def scan(pattern, cons=false)
+-      return nil if @buffer.nil?
+-      rv = @buffer.scan(pattern)
+-      @buffer = $' if cons and rv.size>0
+-      rv
+-    end
+-
+     def read
+     end
+ 
+-    def consume( pattern )
+-      @buffer = $' if pattern.match( @buffer )
+-    end
+-
+-    def match_to( char, pattern )
+-      return pattern.match(@buffer)
+-    end
+-
+-    def match_to_consume( char, pattern )
+-      md = pattern.match(@buffer)
+-      @buffer = $'
+-      return md
+-    end
+-
+     def match(pattern, cons=false)
+-      md = pattern.match(@buffer)
+-      @buffer = $' if cons and md
+-      return md
++      if cons
++        @scanner.scan(pattern).nil? ? nil : @scanner
++      else
++        @scanner.check(pattern).nil? ? nil : @scanner
++      end
+     end
+ 
+     # @return true if the Source is exhausted
+     def empty?
+-      @buffer == ""
+-    end
+-
+-    def position
+-      @orig.index( @buffer )
++      @scanner.eos?
+     end
+ 
+     # @return the current line in the source
+     def current_line
+       lines = @orig.split
+-      res = lines.grep @buffer[0..30]
++      res = lines.grep @scanner.rest[0..30]
+       res = res[-1] if res.kind_of? Array
+       lines.index( res ) if res
+     end
+ 
+     private
++
+     def detect_encoding
+-      buffer_encoding = @buffer.encoding
++      scanner_encoding = @scanner.rest.encoding
+       detected_encoding = "UTF-8"
+       begin
+-        @buffer.force_encoding("ASCII-8BIT")
+-        if @buffer[0, 2] == "\xfe\xff"
+-          @buffer[0, 2] = ""
++        @scanner.string.force_encoding("ASCII-8BIT")
++        if @scanner.scan(/\xfe\xff/n)
+           detected_encoding = "UTF-16BE"
+-        elsif @buffer[0, 2] == "\xff\xfe"
+-          @buffer[0, 2] = ""
++        elsif @scanner.scan(/\xff\xfe/n)
+           detected_encoding = "UTF-16LE"
+-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
+-          @buffer[0, 3] = ""
++        elsif @scanner.scan(/\xef\xbb\xbf/n)
+           detected_encoding = "UTF-8"
+         end
+       ensure
+-        @buffer.force_encoding(buffer_encoding)
++        @scanner.string.force_encoding(scanner_encoding)
+       end
+       self.encoding = detected_encoding
+     end
+ 
+     def encoding_updated
+       if @encoding != 'UTF-8'
+-        @buffer = decode(@buffer)
++        @scanner.string = decode(@scanner.rest)
+         @to_utf = true
+       else
+         @to_utf = false
+-        @buffer.force_encoding ::Encoding::UTF_8
++        @scanner.string.force_encoding(::Encoding::UTF_8)
+       end
+     end
+   end
+@@ -172,7 +138,7 @@ def initialize(arg, block_size=500, encoding=nil)
+       end
+ 
+       if !@to_utf and
+-          @buffer.respond_to?(:force_encoding) and
++          @orig.respond_to?(:force_encoding) and
+           @source.respond_to?(:external_encoding) and
+           @source.external_encoding != ::Encoding::UTF_8
+         @force_utf8 = true
+@@ -181,65 +147,44 @@ def initialize(arg, block_size=500, encoding=nil)
+       end
+     end
+ 
+-    def scan(pattern, cons=false)
+-      rv = super
+-      # You'll notice that this next section is very similar to the same
+-      # section in match(), but just a liiittle different.  This is
+-      # because it is a touch faster to do it this way with scan()
+-      # than the way match() does it; enough faster to warrant duplicating
+-      # some code
+-      if rv.size == 0
+-        until @buffer =~ pattern or @source.nil?
+-          begin
+-            @buffer << readline
+-          rescue Iconv::IllegalSequence
+-            raise
+-          rescue
+-            @source = nil
+-          end
+-        end
+-        rv = super
+-      end
+-      rv.taint
+-      rv
+-    end
+-
+     def read
+       begin
+-        @buffer << readline
++        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
++        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
++        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
++        # and avoids this problem.
++        @scanner.string = @scanner.rest + readline
+       rescue Exception, NameError
+         @source = nil
+       end
+     end
+ 
+-    def consume( pattern )
+-      match( pattern, true )
+-    end
+-
+     def match( pattern, cons=false )
+-      rv = pattern.match(@buffer)
+-      @buffer = $' if cons and rv
+-      while !rv and @source
++      if cons
++        md = @scanner.scan(pattern)
++      else
++        md = @scanner.check(pattern)
++      end
++      while md.nil? and @source
+         begin
+-          @buffer << readline
+-          rv = pattern.match(@buffer)
+-          @buffer = $' if cons and rv
++          @scanner << readline
++          if cons
++            md = @scanner.scan(pattern)
++          else
++            md = @scanner.check(pattern)
++          end
+         rescue
+           @source = nil
+         end
+       end
+-      rv.taint
+-      rv
++
++      md.nil? ? nil : @scanner
+     end
+ 
+     def empty?
+       super and ( @source.nil? || @source.eof? )
+     end
+ 
+-    def position
+-      @er_source.pos rescue 0
+-    end
+-
+     # @return the current line in the source
+     def current_line
+       begin
+@@ -289,7 +234,7 @@ def encoding_updated
+         @source.set_encoding(@encoding, @encoding)
+       end
+       @line_break = encode(">")
+-      @pending_buffer, @buffer = @buffer, ""
++      @pending_buffer, @scanner.string = @scanner.rest, ""
+       @pending_buffer.force_encoding(@encoding)
+       super
+     end
+diff --git a/test/rexml/parse/test_entity_declaration.rb b/test/rexml/parse/test_entity_declaration.rb
+new file mode 100644
+index 0000000000..e15deec60d
+--- /dev/null
++++ b/test/rexml/parse/test_entity_declaration.rb
+@@ -0,0 +1,36 @@
++# frozen_string_literal: false
++require 'test/unit'
++require 'rexml/document'
++
++module REXMLTests
++  class TestParseEntityDeclaration < Test::Unit::TestCase
++    private
++    def xml(internal_subset)
++      <<-XML
++<!DOCTYPE r SYSTEM "urn:x-henrikmartensson:test" [
++#{internal_subset}
++]>
++<r/>
++      XML
++    end
++
++    def parse(internal_subset)
++      REXML::Document.new(xml(internal_subset)).doctype
++    end
++
++    def test_empty
++      exception = assert_raise(REXML::ParseException) do
++        parse(<<-INTERNAL_SUBSET)
++<!ENTITY>
++        INTERNAL_SUBSET
++      end
++      assert_equal(<<-DETAIL.chomp, exception.to_s)
++Malformed notation declaration: name is missing
++Line: 5
++Position: 72
++Last 80 unconsumed characters:
++ <!ENTITY>  ]> <r/>
++      DETAIL
++    end
++  end
++end
+diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
+index ee5438d5e5..d4ece491b9 100644
+--- a/test/rexml/test_core.rb
++++ b/test/rexml/test_core.rb
+@@ -681,7 +681,7 @@ def test_iso_8859_1_output_function
+       koln_iso_8859_1 = "K\xF6ln"
+       koln_utf8 = "K\xc3\xb6ln"
+       source = Source.new( koln_iso_8859_1, 'iso-8859-1' )
+-      results = source.scan(/.*/)[0]
++      results = source.match(/.*/)[0]
+       koln_utf8.force_encoding('UTF-8') if koln_utf8.respond_to?(:force_encoding)
+       assert_equal koln_utf8, results
+       output << results
+
+From 77128555476cb0db798e2912fb3a07d6411dc320 Mon Sep 17 00:00:00 2001
+From: NAITOH Jun <naitoh@gmail.com>
+Date: Sun, 21 Jan 2024 20:02:00 +0900
+Subject: [PATCH 3/7] Use `@scanner << readline` instead of `@scanner.string =
+ @scanner.rest + readline` (#107)
+
+## Why
+
+JRuby's `StringScanner#<<` and `StringScanner#scan` OutOfMemoryError has
+been resolved in strscan gem 3.0.9.
+
+https://github.com/ruby/strscan/issues/83
+
+## Benchmark
+
+```
+RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
+Calculating -------------------------------------
+                         before       after  before(YJIT)  after(YJIT)
+                 dom     10.958      11.044        16.615       16.783 i/s -     100.000 times in 9.126104s 9.055023s 6.018799s 5.958437s
+                 sax     29.624      29.609        44.390       45.370 i/s -     100.000 times in 3.375641s 3.377372s 2.252774s 2.204080s
+                pull     33.868      34.695        51.173       53.492 i/s -     100.000 times in 2.952679s 2.882229s 1.954138s 1.869422s
+              stream     31.719      32.351        43.604       45.403 i/s -     100.000 times in 3.152713s 3.091052s 2.293356s 2.202514s
+
+Comparison:
+                              dom
+         after(YJIT):        16.8 i/s
+        before(YJIT):        16.6 i/s - 1.01x  slower
+               after:        11.0 i/s - 1.52x  slower
+              before:        11.0 i/s - 1.53x  slower
+
+                              sax
+         after(YJIT):        45.4 i/s
+        before(YJIT):        44.4 i/s - 1.02x  slower
+              before:        29.6 i/s - 1.53x  slower
+               after:        29.6 i/s - 1.53x  slower
+
+                             pull
+         after(YJIT):        53.5 i/s
+        before(YJIT):        51.2 i/s - 1.05x  slower
+               after:        34.7 i/s - 1.54x  slower
+              before:        33.9 i/s - 1.58x  slower
+
+                           stream
+         after(YJIT):        45.4 i/s
+        before(YJIT):        43.6 i/s - 1.04x  slower
+               after:        32.4 i/s - 1.40x  slower
+              before:        31.7 i/s - 1.43x  slower
+
+```
+
+- YJIT=ON : 1.01x - 1.05x faster
+- YJIT=OFF : 1.00x - 1.02x faster
+===
+Backported, as the PR #114 and mainly PR #126 count with the method using `<<`
+instead of setting the scanner's string directly.
+
+https://github.com/ruby/rexml/commit/77128555476cb0db798e2912fb3a07d6411dc320
+---
+ lib/rexml/source.rb | 6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
+index 9883e79210..4624777fa0 100644
+--- a/lib/rexml/source.rb
++++ b/lib/rexml/source.rb
+@@ -149,11 +149,7 @@ def initialize(arg, block_size=500, encoding=nil)
+ 
+     def read
+       begin
+-        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
+-        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
+-        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
+-        # and avoids this problem.
+-        @scanner.string = @scanner.rest + readline
++        @scanner << readline
+       rescue Exception, NameError
+         @source = nil
+       end
+
+From 370666e314816b57ecd5878e757224c3b6bc93f5 Mon Sep 17 00:00:00 2001
+From: NAITOH Jun <naitoh@gmail.com>
+Date: Tue, 27 Feb 2024 09:48:35 +0900
+Subject: [PATCH 4/7] Use more StringScanner based API to parse XML (#114)
+
+## Why?
+
+Improve maintainability by optimizing the process so that the parsing
+process proceeds using StringScanner#scan.
+
+## Changed
+- Change `REXML::Parsers::BaseParser` from `frozen_string_literal:
+false` to `frozen_string_literal: true`.
+- Added `Source#string=` method for error message output.
+- Added TestParseDocumentTypeDeclaration#test_no_name test case.
+- Of the `intSubset` of DOCTYPE, "<!" added consideration for processing
+`Comments` that begin with "<!".
+
+## [Benchmark]
+
+```
+RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
+Calculating -------------------------------------
+                         before       after  before(YJIT)  after(YJIT)
+                 dom     11.240      10.569        17.173       18.219 i/s -     100.000 times in 8.896882s 9.461267s 5.823007s 5.488884s
+                 sax     31.812      30.716        48.383       52.532 i/s -     100.000 times in 3.143500s 3.255655s 2.066861s 1.903600s
+                pull     36.855      36.354        56.718       61.443 i/s -     100.000 times in 2.713300s 2.750693s 1.763099s 1.627523s
+              stream     34.176      34.758        49.801       54.622 i/s -     100.000 times in 2.925991s 2.877065s 2.008003s 1.830779s
+
+Comparison:
+                              dom
+         after(YJIT):        18.2 i/s
+        before(YJIT):        17.2 i/s - 1.06x  slower
+              before:        11.2 i/s - 1.62x  slower
+               after:        10.6 i/s - 1.72x  slower
+
+                              sax
+         after(YJIT):        52.5 i/s
+        before(YJIT):        48.4 i/s - 1.09x  slower
+              before:        31.8 i/s - 1.65x  slower
+               after:        30.7 i/s - 1.71x  slower
+
+                             pull
+         after(YJIT):        61.4 i/s
+        before(YJIT):        56.7 i/s - 1.08x  slower
+              before:        36.9 i/s - 1.67x  slower
+               after:        36.4 i/s - 1.69x  slower
+
+                           stream
+         after(YJIT):        54.6 i/s
+        before(YJIT):        49.8 i/s - 1.10x  slower
+               after:        34.8 i/s - 1.57x  slower
+              before:        34.2 i/s - 1.60x  slower
+
+```
+
+- YJIT=ON : 1.06x - 1.10x faster
+- YJIT=OFF : 0.94x - 1.01x faster
+
+---------
+
+Co-authored-by: Sutou Kouhei <kou@clear-code.com>
+
+===
+This commit introduces wider use of StringScanner in
+lib/rexml/parsers/baseparse.rb.
+
+Which is relied upon in later patches.
+
+MISSING_ATTRIBUTE_QUOTES is not present in upstream
+lib/rexml/parsers/baseparser.rb at this point.
+While it doesn't seem to be used, this patch does not
+remove the constant so let's keep it around.
+
+https://github.com/ruby/rexml/commit/370666e314816b57ecd5878e757224c3b6bc93f5
+---
+ lib/rexml/parsers/baseparser.rb               | 328 ++++++++++--------
+ lib/rexml/source.rb                           |  31 +-
+ .../parse/test_document_type_declaration.rb   |  15 +
+ 3 files changed, 205 insertions(+), 169 deletions(-)
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index c29783af5c..dafc1b1449 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -1,4 +1,4 @@
+-# frozen_string_literal: false
++# frozen_string_literal: true
+ 
+ require "strscan"
+ 
+@@ -121,6 +121,19 @@ class BaseParser
+       ######################################################################
+       MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um
+ 
++      module Private
++        INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
++        TAG_PATTERN = /((?>#{QNAME_STR}))/um
++        CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
++        ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
++        NAME_PATTERN = /\s*#{NAME}/um
++        GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
++        PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
++        ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
++      end
++      private_constant :Private
++      include Private
++
+       def initialize( source )
+         self.stream = source
+         @listeners = []
+@@ -206,180 +219,172 @@ def pull_event
+         #STDERR.puts @source.encoding
+         #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
+         if @document_status == nil
+-          word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
+-          word = word[1] unless word.nil?
+-          #STDERR.puts "WORD = #{word.inspect}"
+-          case word
+-          when COMMENT_START
+-            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
+-          when XMLDECL_START
+-            #STDERR.puts "XMLDECL"
+-            results = @source.match( XMLDECL_PATTERN, true )[1]
+-            version = VERSION.match( results )
+-            version = version[1] unless version.nil?
+-            encoding = ENCODING.match(results)
+-            encoding = encoding[1] unless encoding.nil?
+-            if need_source_encoding_update?(encoding)
+-              @source.encoding = encoding
+-            end
+-            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+-              encoding = "UTF-16"
+-            end
+-            standalone = STANDALONE.match(results)
+-            standalone = standalone[1] unless standalone.nil?
+-            return [ :xmldecl, version, encoding, standalone ]
+-          when INSTRUCTION_START
++          if @source.match("<?", true)
+             return process_instruction
+-          when DOCTYPE_START
+-            base_error_message = "Malformed DOCTYPE"
+-            @source.match(DOCTYPE_START, true)
+-            @nsstack.unshift(curr_ns=Set.new)
+-            name = parse_name(base_error_message)
+-            if @source.match(/\A\s*\[/um, true)
+-              id = [nil, nil, nil]
+-              @document_status = :in_doctype
+-            elsif @source.match(/\A\s*>/um, true)
+-              id = [nil, nil, nil]
+-              @document_status = :after_doctype
+-            else
+-              id = parse_id(base_error_message,
+-                            accept_external_id: true,
+-                            accept_public_id: false)
+-              if id[0] == "SYSTEM"
+-                # For backward compatibility
+-                id[1], id[2] = id[2], nil
++          elsif @source.match("<!", true)
++            if @source.match("--", true)
++              return [ :comment, @source.match(/(.*?)-->/um, true)[1] ]
++            elsif @source.match("DOCTYPE", true)
++              base_error_message = "Malformed DOCTYPE"
++              unless @source.match(/\s+/um, true)
++                if @source.match(">")
++                  message = "#{base_error_message}: name is missing"
++                else
++                  message = "#{base_error_message}: invalid name"
++                end
++                @source.string = "<!DOCTYPE" + @source.buffer
++                raise REXML::ParseException.new(message, @source)
+               end
+-              if @source.match(/\A\s*\[/um, true)
+-               @document_status = :in_doctype
+-              elsif @source.match(/\A\s*>/um, true)
++              @nsstack.unshift(curr_ns=Set.new)
++              name = parse_name(base_error_message)
++              if @source.match(/\s*\[/um, true)
++                id = [nil, nil, nil]
++                @document_status = :in_doctype
++              elsif @source.match(/\s*>/um, true)
++                id = [nil, nil, nil]
+                 @document_status = :after_doctype
+               else
+-                message = "#{base_error_message}: garbage after external ID"
+-                raise REXML::ParseException.new(message, @source)
++                id = parse_id(base_error_message,
++                              accept_external_id: true,
++                              accept_public_id: false)
++                if id[0] == "SYSTEM"
++                  # For backward compatibility
++                  id[1], id[2] = id[2], nil
++                end
++                if @source.match(/\s*\[/um, true)
++                  @document_status = :in_doctype
++                elsif @source.match(/\s*>/um, true)
++                  @document_status = :after_doctype
++                else
++                  message = "#{base_error_message}: garbage after external ID"
++                  raise REXML::ParseException.new(message, @source)
++                end
+               end
+-            end
+-            args = [:start_doctype, name, *id]
+-            if @document_status == :after_doctype
+-              @source.match(/\A\s*/um, true)
+-              @stack << [ :end_doctype ]
+-            end
+-            return args
+-          when /\A\s+/
+-          else
+-            @document_status = :after_doctype
+-            if @source.encoding == "UTF-8"
+-              @source.buffer_encoding = ::Encoding::UTF_8
++              args = [:start_doctype, name, *id]
++              if @document_status == :after_doctype
++                @source.match(/\s*/um, true)
++                @stack << [ :end_doctype ]
++              end
++              return args
++            else
++              message = "Invalid XML"
++              raise REXML::ParseException.new(message, @source)
+             end
+           end
+         end
+         if @document_status == :in_doctype
+-          md = @source.match(/\A\s*(.*?>)/um)
+-          case md[1]
+-          when SYSTEMENTITY
+-            match = @source.match( SYSTEMENTITY, true )[1]
+-            return [ :externalentity, match ]
+-
+-          when ELEMENTDECL_START
+-            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
+-
+-          when ENTITY_START
+-            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
+-            ref = false
+-            if match[1] == '%'
+-              ref = true
+-              match.delete_at 1
+-            end
+-            # Now we have to sort out what kind of entity reference this is
+-            if match[2] == 'SYSTEM'
+-              # External reference
+-              match[3] = match[3][1..-2] # PUBID
+-              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
+-              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
+-            elsif match[2] == 'PUBLIC'
+-              # External reference
+-              match[3] = match[3][1..-2] # PUBID
+-              match[4] = match[4][1..-2] # HREF
+-              match.delete_at(5) if match.size > 5 # Chop out NDATA decl
+-              # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
+-            else
+-              match[2] = match[2][1..-2]
+-              match.pop if match.size == 4
+-              # match is [ :entity, name, value ]
+-            end
+-            match << '%' if ref
+-            return match
+-          when ATTLISTDECL_START
+-            md = @source.match( ATTLISTDECL_PATTERN, true )
+-            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
+-            element = md[1]
+-            contents = md[0]
+-
+-            pairs = {}
+-            values = md[0].scan( ATTDEF_RE )
+-            values.each do |attdef|
+-              unless attdef[3] == "#IMPLIED"
+-                attdef.compact!
+-                val = attdef[3]
+-                val = attdef[4] if val == "#FIXED "
+-                pairs[attdef[0]] = val
+-                if attdef[0] =~ /^xmlns:(.*)/
+-                  @nsstack[0] << $1
+-                end
++          @source.match(/\s*/um, true) # skip spaces
++          if @source.match("<!", true)
++            if @source.match("ELEMENT", true)
++              md = @source.match(/(.*?)>/um, true)
++              raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
++              return [ :elementdecl, "<!ELEMENT" + md[1] ]
++            elsif @source.match("ENTITY", true)
++              match = [:entitydecl, *@source.match(ENTITYDECL_PATTERN, true).captures.compact]
++              ref = false
++              if match[1] == '%'
++                ref = true
++                match.delete_at 1
+               end
+-            end
+-            return [ :attlistdecl, element, pairs, contents ]
+-          when NOTATIONDECL_START
+-            base_error_message = "Malformed notation declaration"
+-            unless @source.match(/\A\s*<!NOTATION\s+/um, true)
+-              if @source.match(/\A\s*<!NOTATION\s*>/um)
+-                message = "#{base_error_message}: name is missing"
++              # Now we have to sort out what kind of entity reference this is
++              if match[2] == 'SYSTEM'
++                # External reference
++                match[3] = match[3][1..-2] # PUBID
++                match.delete_at(4) if match.size > 4 # Chop out NDATA decl
++                # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
++              elsif match[2] == 'PUBLIC'
++                # External reference
++                match[3] = match[3][1..-2] # PUBID
++                match[4] = match[4][1..-2] # HREF
++                match.delete_at(5) if match.size > 5 # Chop out NDATA decl
++                # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
+               else
+-                message = "#{base_error_message}: invalid declaration name"
++                match[2] = match[2][1..-2]
++                match.pop if match.size == 4
++                # match is [ :entity, name, value ]
+               end
+-              raise REXML::ParseException.new(message, @source)
+-            end
+-            name = parse_name(base_error_message)
+-            id = parse_id(base_error_message,
+-                          accept_external_id: true,
+-                          accept_public_id: true)
+-            unless @source.match(/\A\s*>/um, true)
+-              message = "#{base_error_message}: garbage before end >"
+-              raise REXML::ParseException.new(message, @source)
++              match << '%' if ref
++              return match
++            elsif @source.match("ATTLIST", true)
++              md = @source.match(ATTLISTDECL_END, true)
++              raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
++              element = md[1]
++              contents = md[0]
++
++              pairs = {}
++              values = md[0].scan( ATTDEF_RE )
++              values.each do |attdef|
++                unless attdef[3] == "#IMPLIED"
++                  attdef.compact!
++                  val = attdef[3]
++                  val = attdef[4] if val == "#FIXED "
++                  pairs[attdef[0]] = val
++                  if attdef[0] =~ /^xmlns:(.*)/
++                    @nsstack[0] << $1
++                  end
++                end
++              end
++              return [ :attlistdecl, element, pairs, contents ]
++            elsif @source.match("NOTATION", true)
++              base_error_message = "Malformed notation declaration"
++              unless @source.match(/\s+/um, true)
++                if @source.match(">")
++                  message = "#{base_error_message}: name is missing"
++                else
++                  message = "#{base_error_message}: invalid name"
++                end
++                @source.string = " <!NOTATION" + @source.buffer
++                raise REXML::ParseException.new(message, @source)
++              end
++              name = parse_name(base_error_message)
++              id = parse_id(base_error_message,
++                            accept_external_id: true,
++                            accept_public_id: true)
++              unless @source.match(/\s*>/um, true)
++                message = "#{base_error_message}: garbage before end >"
++                raise REXML::ParseException.new(message, @source)
++              end
++              return [:notationdecl, name, *id]
++            elsif md = @source.match(/--(.*?)-->/um, true)
++              case md[1]
++              when /--/, /-\z/
++                raise REXML::ParseException.new("Malformed comment", @source)
++              end
++              return [ :comment, md[1] ] if md
+             end
+-            return [:notationdecl, name, *id]
+-          when DOCTYPE_END
++          elsif match = @source.match(/(%.*?;)\s*/um, true)
++            return [ :externalentity, match[1] ]
++          elsif @source.match(/\]\s*>/um, true)
+             @document_status = :after_doctype
+-            @source.match( DOCTYPE_END, true )
+             return [ :end_doctype ]
+           end
+         end
+         if @document_status == :after_doctype
+-          @source.match(/\A\s*/um, true)
++          @source.match(/\s*/um, true)
+         end
+         begin
+-          @source.read if @source.buffer.size<2
+-          if @source.buffer[0] == ?<
+-            if @source.buffer[1] == ?/
++          if @source.match("<", true)
++            if @source.match("/", true)
+               @nsstack.shift
+               last_tag = @tags.pop
+-              #md = @source.match_to_consume( '>', CLOSE_MATCH)
+-              md = @source.match( CLOSE_MATCH, true )
++              md = @source.match(CLOSE_PATTERN, true)
+               if md and !last_tag
+                 message = "Unexpected top-level end tag (got '#{md[1]}')"
+                 raise REXML::ParseException.new(message, @source)
+               end
+               if md.nil? or last_tag != md[1]
+                 message = "Missing end tag for '#{last_tag}'"
+-                message << " (got '#{md[1]}')" if md
++                message += " (got '#{md[1]}')" if md
++                @source.string = "</" + @source.buffer if md.nil?
+                 raise REXML::ParseException.new(message, @source)
+               end
+               return [ :end_element, last_tag ]
+-            elsif @source.buffer[1] == ?!
+-              md = @source.match(/\A(\s*[^>]*>)/um)
++            elsif @source.match("!", true)
++              md = @source.match(/([^>]*>)/um)
+               #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
+               raise REXML::ParseException.new("Malformed node", @source) unless md
+-              if md[0][2] == ?-
+-                md = @source.match( COMMENT_PATTERN, true )
++              if md[0][0] == ?-
++                md = @source.match(/--(.*?)-->/um, true)
+ 
+                 case md[1]
+                 when /--/, /-\z/
+@@ -388,19 +393,18 @@ def pull_event
+ 
+                 return [ :comment, md[1] ] if md
+               else
+-                md = @source.match( CDATA_PATTERN, true )
++                md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
+                 return [ :cdata, md[1] ] if md
+               end
+               raise REXML::ParseException.new( "Declarations can only occur "+
+                 "in the doctype declaration.", @source)
+-            elsif @source.buffer[1] == ??
++            elsif @source.match("?", true)
+               return process_instruction
+             else
+               # Get the next tag
+-              md = @source.match(TAG_MATCH, true)
++              md = @source.match(TAG_PATTERN, true)
+               unless md
+-                # Check for missing attribute quotes
+-                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
++                @source.string = "<" + @source.buffer
+                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
+               end
+               tag = md[1]
+@@ -425,7 +429,7 @@ def pull_event
+               return [ :start_element, tag, attributes ]
+             end
+           else
+-            md = @source.match( TEXT_PATTERN, true )
++            md = @source.match(/([^<]*)/um, true)
+             text = md[1]
+             if md[0].length == 0
+               @source.match( /(\s+)/, true )
+@@ -472,8 +476,7 @@ def normalize( input, entities=nil, entity_filter=nil )
+ 
+       # Unescapes all possible entities
+       def unnormalize( string, entities=nil, filter=nil )
+-        rv = string.clone
+-        rv.gsub!( /\r\n?/, "\n" )
++        rv = string.gsub( /\r\n?/, "\n" )
+         matches = rv.scan( REFERENCE_RE )
+         return rv if matches.size == 0
+         rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
+@@ -508,9 +511,9 @@ def need_source_encoding_update?(xml_declaration_encoding)
+       end
+ 
+       def parse_name(base_error_message)
+-        md = @source.match(/\A\s*#{NAME}/um, true)
++        md = @source.match(NAME_PATTERN, true)
+         unless md
+-          if @source.match(/\A\s*\S/um)
++          if @source.match(/\s*\S/um)
+             message = "#{base_error_message}: invalid name"
+           else
+             message = "#{base_error_message}: name is missing"
+@@ -671,12 +674,29 @@ def parse_attributes(prefixes, curr_ns)
+       end
+ 
+       def process_instruction
+-        match_data = @source.match(INSTRUCTION_PATTERN, true)
++        match_data = @source.match(INSTRUCTION_END, true)
+         unless match_data
+           message = "Invalid processing instruction node"
++          @source.string = "<?" + @source.buffer
+           raise REXML::ParseException.new(message, @source)
+         end
+-        [:processing_instruction, match_data[1], match_data[4]]
++        if @document_status.nil? and match_data[1] == "xml"
++          content = match_data[2]
++          version = VERSION.match(content)
++          version = version[1] unless version.nil?
++          encoding = ENCODING.match(content)
++          encoding = encoding[1] unless encoding.nil?
++          if need_source_encoding_update?(encoding)
++            @source.encoding = encoding
++          end
++          if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
++            encoding = "UTF-16"
++          end
++          standalone = STANDALONE.match(content)
++          standalone = standalone[1] unless standalone.nil?
++          return [ :xmldecl, version, encoding, standalone ]
++        end
++        [:processing_instruction, match_data[1], match_data[2]]
+       end
+     end
+   end
+diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
+index 4624777fa0..d2a6ad872e 100644
+--- a/lib/rexml/source.rb
++++ b/lib/rexml/source.rb
+@@ -76,6 +76,10 @@ def match(pattern, cons=false)
+       end
+     end
+ 
++    def string=(string)
++      @scanner.string = string
++    end
++
+     # @return true if the Source is exhausted
+     def empty?
+       @scanner.eos?
+@@ -150,28 +154,25 @@ def initialize(arg, block_size=500, encoding=nil)
+     def read
+       begin
+         @scanner << readline
++        true
+       rescue Exception, NameError
+         @source = nil
++        false
+       end
+     end
+ 
+     def match( pattern, cons=false )
+-      if cons
+-        md = @scanner.scan(pattern)
+-      else
+-        md = @scanner.check(pattern)
+-      end
+-      while md.nil? and @source
+-        begin
+-          @scanner << readline
+-          if cons
+-            md = @scanner.scan(pattern)
+-          else
+-            md = @scanner.check(pattern)
+-          end
+-        rescue
+-          @source = nil
++      read if @scanner.eos? && @source
++      while true
++        if cons
++          md = @scanner.scan(pattern)
++        else
++          md = @scanner.check(pattern)
+         end
++        break if md
++        return nil if pattern.is_a?(String) && pattern.bytesize <= @scanner.rest_size
++        return nil if @source.nil?
++        return nil unless read
+       end
+ 
+       md.nil? ? nil : @scanner
+diff --git a/test/rexml/parse/test_document_type_declaration.rb b/test/rexml/parse/test_document_type_declaration.rb
+index 55713909e7..8faa0b78be 100644
+--- a/test/rexml/parse/test_document_type_declaration.rb
++++ b/test/rexml/parse/test_document_type_declaration.rb
+@@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start
+ + r SYSTEM "urn:x-rexml:test" [ ]>  <r/> 
+         DETAIL
+       end
++
++      def test_no_name
++        exception = assert_raise(REXML::ParseException) do
++          parse(<<-DOCTYPE)
++<!DOCTYPE>
++          DOCTYPE
++        end
++        assert_equal(<<-DETAIL.chomp, exception.to_s)
++Malformed DOCTYPE: name is missing
++Line: 3
++Position: 17
++Last 80 unconsumed characters:
++<!DOCTYPE>  <r/> 
++        DETAIL
++      end
+     end
+ 
+     class TestExternalID < self
+
+From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001
+From: NAITOH Jun <naitoh@gmail.com>
+Date: Mon, 18 Mar 2024 23:30:47 +0900
+Subject: [PATCH 5/7]  Optimize the parse_attributes method to use
+ `Source#match` to parse XML.  (#119)
+
+## Why?
+
+Improve maintainability by consolidating processing into `Source#match`.
+
+## Benchmark
+```
+RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
+Calculating -------------------------------------
+                         before       after  before(YJIT)  after(YJIT)
+                 dom     10.891      10.622        16.356       17.403 i/s -     100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s
+                 sax     30.335      29.845        49.749       54.877 i/s -     100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s
+                pull     35.514      34.801        61.123       66.908 i/s -     100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s
+              stream     35.141      34.475        52.110       56.836 i/s -     100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s
+
+Comparison:
+                              dom
+         after(YJIT):        17.4 i/s
+        before(YJIT):        16.4 i/s - 1.06x  slower
+              before:        10.9 i/s - 1.60x  slower
+               after:        10.6 i/s - 1.64x  slower
+
+                              sax
+         after(YJIT):        54.9 i/s
+        before(YJIT):        49.7 i/s - 1.10x  slower
+              before:        30.3 i/s - 1.81x  slower
+               after:        29.8 i/s - 1.84x  slower
+
+                             pull
+         after(YJIT):        66.9 i/s
+        before(YJIT):        61.1 i/s - 1.09x  slower
+              before:        35.5 i/s - 1.88x  slower
+               after:        34.8 i/s - 1.92x  slower
+
+                           stream
+         after(YJIT):        56.8 i/s
+        before(YJIT):        52.1 i/s - 1.09x  slower
+              before:        35.1 i/s - 1.62x  slower
+               after:        34.5 i/s - 1.65x  slower
+
+```
+
+- YJIT=ON : 1.06x - 1.10x faster
+- YJIT=OFF : 0.97x - 0.98x faster
+===
+"test_attribute_namespace_conflict" in test/rexml/test_core.rb
+adjustment not applied as the commit that added the test is not present.
+
+Wider use of Source#match to parse XML.
+
+Changes the way BaseParser#parse_attribute
+grabs fields that make up an XML attribute.
+
+This is relied upon by upstream 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
+that actually fixes CVE-2024-35176.
+
+https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d
+---
+ lib/rexml/parsers/baseparser.rb  | 113 ++++++++++++-------------------
+ test/rexml/parse/test_element.rb |   2 +-
+ test/rexml/test_core.rb          |  17 ++++-
+ 3 files changed, 61 insertions(+), 71 deletions(-)
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index dafc1b1449..45eabe8cc5 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -123,7 +123,7 @@ class BaseParser
+ 
+       module Private
+         INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+-        TAG_PATTERN = /((?>#{QNAME_STR}))/um
++        TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
+         CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
+         ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
+         NAME_PATTERN = /\s*#{NAME}/um
+@@ -592,85 +592,60 @@ def parse_id_invalid_details(accept_external_id:,
+       def parse_attributes(prefixes, curr_ns)
+         attributes = {}
+         closed = false
+-        match_data = @source.match(/^(.*?)(\/)?>/um, true)
+-        if match_data.nil?
+-          message = "Start tag isn't ended"
+-          raise REXML::ParseException.new(message, @source)
+-        end
+-
+-        raw_attributes = match_data[1]
+-        closed = !match_data[2].nil?
+-        return attributes, closed if raw_attributes.nil?
+-        return attributes, closed if raw_attributes.empty?
+-
+-        scanner = StringScanner.new(raw_attributes)
+-        until scanner.eos?
+-          if scanner.scan(/\s+/)
+-            break if scanner.eos?
+-          end
+-
+-          pos = scanner.pos
+-          loop do
+-            break if scanner.scan(ATTRIBUTE_PATTERN)
+-            unless scanner.scan(QNAME)
+-              message = "Invalid attribute name: <#{scanner.rest}>"
+-              raise REXML::ParseException.new(message, @source)
+-            end
+-            name = scanner[0]
+-            unless scanner.scan(/\s*=\s*/um)
++        while true
++          if @source.match(">", true)
++            return attributes, closed
++          elsif @source.match("/>", true)
++            closed = true
++            return attributes, closed
++          elsif match = @source.match(QNAME, true)
++            name = match[1]
++            prefix = match[2]
++            local_part = match[3]
++
++            unless @source.match(/\s*=\s*/um, true)
+               message = "Missing attribute equal: <#{name}>"
+               raise REXML::ParseException.new(message, @source)
+             end
+-            quote = scanner.scan(/['"]/)
+-            unless quote
+-              message = "Missing attribute value start quote: <#{name}>"
+-              raise REXML::ParseException.new(message, @source)
+-            end
+-            unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
+-              match_data = @source.match(/^(.*?)(\/)?>/um, true)
+-              if match_data
+-                scanner << "/" if closed
+-                scanner << ">"
+-                scanner << match_data[1]
+-                scanner.pos = pos
+-                closed = !match_data[2].nil?
+-                next
++            unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
++              if match = @source.match(/(['"])/, true)
++                message =
++                  "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
++                raise REXML::ParseException.new(message, @source)
++              else
++                message = "Missing attribute value start quote: <#{name}>"
++                raise REXML::ParseException.new(message, @source)
+               end
+-              message =
+-                "Missing attribute value end quote: <#{name}>: <#{quote}>"
+-              raise REXML::ParseException.new(message, @source)
+             end
+-          end
+-          name = scanner[1]
+-          prefix = scanner[2]
+-          local_part = scanner[3]
+-          # quote = scanner[4]
+-          value = scanner[5]
+-          if prefix == "xmlns"
+-            if local_part == "xml"
+-              if value != "http://www.w3.org/XML/1998/namespace"
+-                msg = "The 'xml' prefix must not be bound to any other namespace "+
++            value = match[2]
++            if prefix == "xmlns"
++              if local_part == "xml"
++                if value != "http://www.w3.org/XML/1998/namespace"
++                  msg = "The 'xml' prefix must not be bound to any other namespace "+
++                    "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
++                  raise REXML::ParseException.new( msg, @source, self )
++                end
++              elsif local_part == "xmlns"
++                msg = "The 'xmlns' prefix must not be declared "+
+                   "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+-                raise REXML::ParseException.new( msg, @source, self )
++                raise REXML::ParseException.new( msg, @source, self)
+               end
+-            elsif local_part == "xmlns"
+-              msg = "The 'xmlns' prefix must not be declared "+
+-                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
+-              raise REXML::ParseException.new( msg, @source, self)
++              curr_ns << local_part
++            elsif prefix
++              prefixes << prefix unless prefix == "xml"
+             end
+-            curr_ns << local_part
+-          elsif prefix
+-            prefixes << prefix unless prefix == "xml"
+-          end
+ 
+-          if attributes.has_key?(name)
+-            msg = "Duplicate attribute #{name.inspect}"
+-            raise REXML::ParseException.new(msg, @source, self)
+-          end
++            if attributes.has_key?(name)
++              msg = "Duplicate attribute #{name.inspect}"
++              raise REXML::ParseException.new(msg, @source, self)
++            end
+ 
+-          attributes[name] = value
++            attributes[name] = value
++          else
++            message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
++            raise REXML::ParseException.new(message, @source)
++          end
+         end
+-        return attributes, closed
+       end
+ 
+       def process_instruction
+diff --git a/test/rexml/parse/test_element.rb b/test/rexml/parse/test_element.rb
+index e8dce4b997..987214f3bb 100644
+--- a/test/rexml/parse/test_element.rb
++++ b/test/rexml/parse/test_element.rb
+@@ -43,7 +43,7 @@ def test_empty_namespace_attribute_name
+ Line: 1
+ Position: 13
+ Last 80 unconsumed characters:
+-
++:a=""></x>
+         DETAIL
+       end
+ 
+diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb
+index d4ece491b9..46de90ccd7 100644
+--- a/test/rexml/test_core.rb
++++ b/test/rexml/test_core.rb
+@@ -1277,11 +1277,26 @@ def test_ticket_21
+       exception = assert_raise(ParseException) do
+         Document.new(src)
+       end
+-      assert_equal(<<-DETAIL, exception.to_s)
++      assert_equal(<<-DETAIL.chomp, exception.to_s)
+ Missing attribute value start quote: <bar>
+ Line: 1
+ Position: 16
+ Last 80 unconsumed characters:
++value/>
++      DETAIL
++    end
++
++    def test_parse_exception_on_missing_attribute_end_quote
++      src = '<foo bar="value/>'
++      exception = assert_raise(ParseException) do
++        Document.new(src)
++      end
++      assert_equal(<<-DETAIL.chomp, exception.to_s)
++Missing attribute value end quote: <bar>: <">
++Line: 1
++Position: 17
++Last 80 unconsumed characters:
++value/>
+       DETAIL
+     end
+ 
+
+From 4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb Mon Sep 17 00:00:00 2001
+From: Nobuyoshi Nakada <nobu@ruby-lang.org>
+Date: Thu, 16 May 2024 11:26:51 +0900
+Subject: [PATCH 6/7] Read quoted attributes in chunks (#126)
+
+===
+test/rexml/test_document.rb:
+* test_gt_linear_performance
+Test not included, Ruby 2.5 does not have assert_linear_performance
+method available in the test suite. It was introduced in Ruby 2.7.
+* Do not require 'core_assertions'
+Current requires are fine for running the tests in mock build.
+
+The fix for CVE-2024-35176. By reading the field in chunks instead of 1
+by 1 it improves performance for large strings composed of angle
+brackets.
+
+https://github.com/ruby/rexml/commit/4325835f92f3f142ebd91a3fdba4e1f1ab7f1cfb
+---
+ lib/rexml/parsers/baseparser.rb | 20 ++++++++++----------
+ lib/rexml/source.rb             | 29 ++++++++++++++++++++++++-----
+ 2 files changed, 34 insertions(+), 15 deletions(-)
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index 45eabe8cc5..304835e407 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -607,17 +607,17 @@ def parse_attributes(prefixes, curr_ns)
+               message = "Missing attribute equal: <#{name}>"
+               raise REXML::ParseException.new(message, @source)
+             end
+-            unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
+-              if match = @source.match(/(['"])/, true)
+-                message =
+-                  "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
+-                raise REXML::ParseException.new(message, @source)
+-              else
+-                message = "Missing attribute value start quote: <#{name}>"
+-                raise REXML::ParseException.new(message, @source)
+-              end
++            unless match = @source.match(/(['"])/, true)
++              message = "Missing attribute value start quote: <#{name}>"
++              raise REXML::ParseException.new(message, @source)
++            end
++            quote = match[1]
++            value = @source.read_until(quote)
++            unless value.chomp!(quote)
++              message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
++              raise REXML::ParseException.new(message, @source)
+             end
+-            value = match[2]
++            @source.match(/\s*/um, true)
+             if prefix == "xmlns"
+               if local_part == "xml"
+                 if value != "http://www.w3.org/XML/1998/namespace"
+diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
+index d2a6ad872e..2a3bd36695 100644
+--- a/lib/rexml/source.rb
++++ b/lib/rexml/source.rb
+@@ -65,7 +65,11 @@ def encoding=(enc)
+       encoding_updated
+     end
+ 
+-    def read
++    def read(term = nil)
++    end
++
++    def read_until(term)
++      @scanner.scan_until(Regexp.union(term)) or @scanner.rest
+     end
+ 
+     def match(pattern, cons=false)
+@@ -151,9 +155,9 @@ def initialize(arg, block_size=500, encoding=nil)
+       end
+     end
+ 
+-    def read
++    def read(term = nil)
+       begin
+-        @scanner << readline
++        @scanner << readline(term)
+         true
+       rescue Exception, NameError
+         @source = nil
+@@ -161,6 +165,21 @@ def read
+       end
+     end
+ 
++    def read_until(term)
++      pattern = Regexp.union(term)
++      data = []
++      begin
++        until str = @scanner.scan_until(pattern)
++          @scanner << readline(term)
++        end
++      rescue EOFError
++        @scanner.rest
++      else
++        read if @scanner.eos? and !@source.eof?
++        str
++      end
++    end
++
+     def match( pattern, cons=false )
+       read if @scanner.eos? && @source
+       while true
+@@ -204,8 +223,8 @@ def current_line
+     end
+ 
+     private
+-    def readline
+-      str = @source.readline(@line_break)
++    def readline(term = nil)
++      str = @source.readline(term || @line_break)
+       if @pending_buffer
+         if str.nil?
+           str = @pending_buffer
+
+From f1df7d13b3e57a5e059273d2f0870163c08d7420 Mon Sep 17 00:00:00 2001
+From: Sutou Kouhei <kou@clear-code.com>
+Date: Mon, 20 May 2024 12:17:27 +0900
+Subject: [PATCH 7/7] Add support for old strscan
+
+Fix GH-132
+
+If we support old strscan, users can also use strscan installed as a
+default gem.
+
+Reported by Adam. Thanks!!!
+
+===
+strscan shipped with Ruby 2.5 lacks a method StringScanner#captures.
+ruby/rexml#105 brought this dependency on the method.
+
+https://bugs.ruby-lang.org/issues/20516#note-11
+https://github.com/ruby/rexml/commit/f1df7d13b3e57a5e059273d2f0870163c08d7420
+---
+ lib/rexml/parsers/baseparser.rb | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
+index 304835e407..006d8acb9b 100644
+--- a/lib/rexml/parsers/baseparser.rb
++++ b/lib/rexml/parsers/baseparser.rb
+@@ -9,6 +9,17 @@
+ 
+ module REXML
+   module Parsers
++    if StringScanner::Version < "3.0.8"
++      module StringScannerCaptures
++        refine StringScanner do
++          def captures
++            values_at(*(1...size))
++          end
++        end
++      end
++      using StringScannerCaptures
++    end
++
+     # = Using the Pull Parser
+     # <em>This API is experimental, and subject to change.</em>
+     #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
diff --git a/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch b/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch
new file mode 100644
index 0000000..59e734f
--- /dev/null
+++ b/rubygem-strscan-1.0.2-Accept-String-as-a-pattern.patch
@@ -0,0 +1,264 @@
+From e56ac27d19cc3acdf6c1cb13b14224c43df5f5f6 Mon Sep 17 00:00:00 2001
+From: Kouhei Sutou <kou@clear-code.com>
+Date: Thu, 4 Apr 2019 17:52:50 +0900
+Subject: [PATCH] Accept String as a pattern
+
+It's only for head only match case such as StringScanner#scan.
+
+If we use a String as a pattern, we can improve match performance.
+Here is a result of the including benchmark. It shows String as a
+pattern is 1.25x faster than Regexp as a pattern.
+
+    % rake benchmark
+    /tmp/local/bin/ruby -S benchmark-driver benchmark/scan.yaml
+    Warming up --------------------------------------
+                  regexp    12.094M i/s -     12.242M times in 1.012250s (82.69ns/i, 277clocks/i)
+                  string    14.653M i/s -     14.889M times in 1.016124s (68.25ns/i, 252clocks/i)
+    Calculating -------------------------------------
+                  regexp    14.713M i/s -     36.281M times in 2.465970s (67.97ns/i, 254clocks/i)
+                  string    18.422M i/s -     43.959M times in 2.386255s (54.28ns/i, 201clocks/i)
+
+    Comparison:
+                  string:  18421631.8 i/s
+                  regexp:  14712660.7 i/s - 1.25x  slower
+
+====
+Backport https://github.com/ruby/strscan/pull/4 for strscan.
+
+REXML fixes for CVE-2024-35716 depend on this feature.
+---
+ ext/strscan/strscan.c              | 92 +++++++++++++++++++-----------
+ test/strscan/test_stringscanner.rb | 45 ++++++++++++++-
+ 2 files changed, 100 insertions(+), 37 deletions(-)
+
+diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c
+index d6168a0d4f..43319b672e 100644
+--- a/ext/strscan/strscan.c
++++ b/ext/strscan/strscan.c
+@@ -447,15 +447,18 @@ strscan_set_pos(VALUE self, VALUE v)
+ }
+ 
+ static VALUE
+-strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
++strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly)
+ {
+-    regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
+     struct strscanner *p;
+-    regex_t *re;
+-    long ret;
+-    int tmpreg;
+ 
+-    Check_Type(regex, T_REGEXP);
++    if (headonly) {
++        if (!RB_TYPE_P(pattern, T_REGEXP)) {
++            StringValue(pattern);
++        }
++    }
++    else {
++        Check_Type(pattern, T_REGEXP);
++    }
+     GET_SCANNER(self, p);
+ 
+     CLEAR_MATCH_STATUS(p);
+@@ -463,37 +466,55 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
+         return Qnil;
+     }
+ 
+-    p->regex = regex;
+-    re = rb_reg_prepare_re(regex, p->str);
+-    tmpreg = re != RREGEXP_PTR(regex);
+-    if (!tmpreg) RREGEXP(regex)->usecnt++;
++    if (RB_TYPE_P(pattern, T_REGEXP)) {
++        regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
++        regex_t *re;
++        long ret;
++        int tmpreg;
++
++        p->regex = pattern;
++        re = rb_reg_prepare_re(pattern, p->str);
++        tmpreg = re != RREGEXP_PTR(pattern);
++        if (!tmpreg) RREGEXP(pattern)->usecnt++;
++
++        if (headonly) {
++            ret = onig_match(re, (UChar* )CURPTR(p),
++                             (UChar* )(CURPTR(p) + S_RESTLEN(p)),
++                             (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
++        }
++        else {
++            ret = onig_search(re,
++                              (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
++                              (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
++                              &(p->regs), ONIG_OPTION_NONE);
++        }
++        if (!tmpreg) RREGEXP(pattern)->usecnt--;
++        if (tmpreg) {
++            if (RREGEXP(pattern)->usecnt) {
++                onig_free(re);
++            }
++            else {
++                onig_free(RREGEXP_PTR(pattern));
++                RREGEXP_PTR(pattern) = re;
++            }
++        }
+ 
+-    if (headonly) {
+-        ret = onig_match(re, (UChar* )CURPTR(p),
+-                         (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+-                         (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
++        if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
++        if (ret < 0) {
++            /* not matched */
++            return Qnil;
++        }
+     }
+     else {
+-        ret = onig_search(re,
+-                          (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+-                          (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+-                          &(p->regs), ONIG_OPTION_NONE);
+-    }
+-    if (!tmpreg) RREGEXP(regex)->usecnt--;
+-    if (tmpreg) {
+-        if (RREGEXP(regex)->usecnt) {
+-            onig_free(re);
++        rb_enc_check(p->str, pattern);
++        if (S_RESTLEN(p) < RSTRING_LEN(pattern)) {
++            return Qnil;
+         }
+-        else {
+-            onig_free(RREGEXP_PTR(regex));
+-            RREGEXP_PTR(regex) = re;
++        if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) {
++            return Qnil;
+         }
+-    }
+-
+-    if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
+-    if (ret < 0) {
+-        /* not matched */
+-        return Qnil;
++        onig_region_clear(&(p->regs));
++        onig_region_set(&(p->regs), 0, 0, RSTRING_LEN(pattern));
+     }
+ 
+     MATCHED(p);
+@@ -520,7 +541,8 @@ strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
+  *   p s.scan(/\w+/)   # -> "test"
+  *   p s.scan(/\w+/)   # -> nil
+  *   p s.scan(/\s+/)   # -> " "
+- *   p s.scan(/\w+/)   # -> "string"
++ *   p s.scan("str")   # -> "str"
++ *   p s.scan(/\w+/)   # -> "ing"
+  *   p s.scan(/./)     # -> nil
+  *
+  */
+@@ -539,6 +561,7 @@ strscan_scan(VALUE self, VALUE re)
+  *   s = StringScanner.new('test string')
+  *   p s.match?(/\w+/)   # -> 4
+  *   p s.match?(/\w+/)   # -> 4
++ *   p s.match?("test")  # -> 4
+  *   p s.match?(/\s+/)   # -> nil
+  */
+ static VALUE
+@@ -560,7 +583,8 @@ strscan_match_p(VALUE self, VALUE re)
+  *   p s.skip(/\w+/)   # -> 4
+  *   p s.skip(/\w+/)   # -> nil
+  *   p s.skip(/\s+/)   # -> 1
+- *   p s.skip(/\w+/)   # -> 6
++ *   p s.skip("st")    # -> 2
++ *   p s.skip(/\w+/)   # -> 4
+  *   p s.skip(/./)     # -> nil
+  *
+  */
+diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb
+index 3423f9cfed..63b1ce1f9b 100644
+--- a/test/strscan/test_stringscanner.rb
++++ b/test/strscan/test_stringscanner.rb
+@@ -282,6 +282,22 @@ def test_scan
+     assert_equal "", s.scan(//)
+   end
+ 
++  def test_scan_string
++    s = StringScanner.new('stra strb strc')
++    assert_equal 'str', s.scan('str')
++    assert_equal 'str', s[0]
++    assert_equal 3, s.pos
++    assert_equal false, s.tainted?
++    assert_equal 'a ', s.scan('a ')
++
++    str = 'stra strb strc'.dup
++    str.taint
++    s = StringScanner.new(str, false)
++    matched = s.scan('str')
++    assert_equal 'str', matched
++    assert_equal true, matched.tainted?
++  end
++
+   def test_skip
+     s = StringScanner.new('stra strb strc', true)
+     assert_equal 4, s.skip(/\w+/)
+@@ -367,8 +383,10 @@ def test_matched
+     assert_equal false, s.matched.tainted?
+     s.scan(/\s+/)
+     assert_equal ' ', s.matched
++    s.scan('st')
++    assert_equal 'st', s.matched
+     s.scan(/\w+/)
+-    assert_equal 'strb', s.matched
++    assert_equal 'rb', s.matched
+     s.scan(/\s+/)
+     assert_equal ' ', s.matched
+     s.scan(/\w+/)
+@@ -483,7 +501,7 @@ def test_pre_match
+     s.skip(/\s/)
+     assert_equal 'a', s.pre_match
+     assert_equal false, s.pre_match.tainted?
+-    s.scan(/\w/)
++    s.scan('b')
+     assert_equal 'a ', s.pre_match
+     s.scan_until(/c/)
+     assert_equal 'a b ', s.pre_match
+@@ -513,7 +531,7 @@ def test_post_match
+     assert_equal ' b c d e', s.post_match
+     s.skip(/\s/)
+     assert_equal 'b c d e', s.post_match
+-    s.scan(/\w/)
++    s.scan('b')
+     assert_equal ' c d e', s.post_match
+     s.scan_until(/c/)
+     assert_equal ' d e', s.post_match
+@@ -589,6 +607,20 @@ def test_encoding
+     assert_equal(Encoding::EUC_JP, ss.scan(/./e).encoding)
+   end
+ 
++  def test_encoding_string
++    str = "\xA1\xA2".dup.force_encoding("euc-jp")
++    ss = StringScanner.new(str)
++    assert_equal(str.dup, ss.scan(str.dup))
++  end
++
++  def test_invalid_encoding_string
++    str = "\xA1\xA2".dup.force_encoding("euc-jp")
++    ss = StringScanner.new(str)
++    assert_raise(Encoding::CompatibilityError) do
++      ss.scan(str.encode("UTF-8"))
++    end
++  end
++
+   def test_generic_regexp
+     ss = StringScanner.new("\xA1\xA2".dup.force_encoding("euc-jp"))
+     t = ss.scan(/./)
+@@ -643,6 +675,13 @@ def test_exist_p
+     assert_equal(nil, s.exist?(/e/))
+   end
+ 
++  def test_exist_p_string
++    s = StringScanner.new("test string")
++    assert_raise(TypeError) do
++      s.exist?(" ")
++    end
++  end
++
+   def test_skip_until
+     s = StringScanner.new("Foo Bar Baz")
+     assert_equal(3, s.skip_until(/Foo/))