python-markdown/python-markdown-3.5.1-fix-CVE-2025-69534.patch

From 3191e76a3e6d0b4a89bbf9ec52bbc84aa24e22b8 Mon Sep 17 00:00:00 2001
From: Waylan Limberg <waylan.limberg@icloud.com>
Date: Wed, 18 Jun 2025 10:29:03 -0400
Subject: [PATCH 1/4] Ensure incomplete markup declaration in raw HTML doesn't
 crash parser.

See Python bug report at gh-77057 for details. Until we drop support for
Python < 3.13 (where this was fixed upstream), we need to avoid the
unwanted error by checking for it explicitly. Fixes #1534.
---
 markdown/extensions/md_in_html.py            | 4 ++++
 markdown/htmlparser.py                       | 4 ++++
 tests/test_syntax/blocks/test_html_blocks.py | 7 +++++++
 3 files changed, 15 insertions(+)

diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index 982d603..99001ca 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -227,6 +227,10 @@ class HTMLExtractorExtra(HTMLExtractor):

     def parse_html_declaration(self, i):
         if self.at_line_start() or self.intail or self.mdstack:
+            if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
+                # We have encountered the bug in #1534 (Python bug `gh-77057`).
+                # Provide an override until we drop support for Python < 3.13.
+                return self.parse_bogus_comment(i)
             # The same override exists in `HTMLExtractor` without the check
             # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
             return super(HTMLExtractor, self).parse_html_declaration(i)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 29e2300..b9b9c6f 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -267,6 +267,10 @@ class HTMLExtractor(htmlparser.HTMLParser):

     def parse_html_declaration(self, i: int) -> int:
         if self.at_line_start() or self.intail:
+            if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
+                # We have encountered the bug in #1534 (Python bug `gh-77057`).
+                # Provide an override until we drop support for Python < 3.13.
+                return self.parse_bogus_comment(i)
             return super().parse_html_declaration(i)
         # This is not the beginning of a raw block so treat as plain data
         # and avoid consuming any tags which may follow (see #1066).
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index 22b9498..e8c37b3 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -1275,6 +1275,13 @@ class TestHTMLBlocks(TestCase):
             )
         )

+    def test_not_actually_cdata(self):
+        # Ensure bug reported in #1534 is avoided.
+        self.assertMarkdownRenders(
+            '<![',
+            '<p>&lt;![</p>'
+        )
+
     def test_raw_cdata_code_span(self):
         self.assertMarkdownRenders(
             self.dedent(
--
2.54.0


From 1a38a252e759b6a7d86710640af2b7157feff51c Mon Sep 17 00:00:00 2001
From: Isaac Muse <faceless.shop@gmail.com>
Date: Thu, 19 Jun 2025 09:46:13 -0600
Subject: [PATCH 2/4] Fixes for Python 3.14

- Fix codecs deprecation
- Fix issue with unclosed `<![`
- Fix issue with unclosed HTML tag `<foo`
- Fix issue with unclosed comments
- Add tests that run on the Python 3.14 beta and should automatically update after release
  (not included in this backport, which only touches files under markdown/)

Fixes #1537
---
 markdown/__main__.py              |  3 +--
 markdown/core.py                  |  2 +-
 markdown/extensions/md_in_html.py |  6 +++++-
 markdown/htmlparser.py            | 24 ++++++++++++++++++++++--
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/markdown/__main__.py b/markdown/__main__.py
index c323aaa..259df63 100644
--- a/markdown/__main__.py
+++ b/markdown/__main__.py
@@ -21,7 +21,6 @@ from __future__ import annotations

 import sys
 import optparse
-import codecs
 import warnings
 import markdown
 try:
@@ -100,7 +99,7 @@ def parse_options(args=None, values=None):

     extension_configs = {}
     if options.configfile:
-        with codecs.open(
+        with open(
             options.configfile, mode="r", encoding=options.encoding
         ) as fp:
             try:
diff --git a/markdown/core.py b/markdown/core.py
index 6b556b4..e091b21 100644
--- a/markdown/core.py
+++ b/markdown/core.py
@@ -417,7 +417,7 @@ class Markdown:
         # Read the source
         if input:
             if isinstance(input, str):
-                input_file = codecs.open(input, mode="r", encoding=encoding)
+                input_file = open(input, mode="r", encoding=encoding)
             else:
                 input_file = codecs.getreader(encoding)(input)
             text = input_file.read()
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index 99001ca..bf2a2fa 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -230,7 +230,11 @@ class HTMLExtractorExtra(HTMLExtractor):
             if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
                 # We have encountered the bug in #1534 (Python bug `gh-77057`).
                 # Provide an override until we drop support for Python < 3.13.
-                return self.parse_bogus_comment(i)
+                result = self.parse_bogus_comment(i)
+                if result == -1:
+                    self.handle_data(self.rawdata[i:i + 1])
+                    return i + 1
+                return result
             # The same override exists in `HTMLExtractor` without the check
             # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
             return super(HTMLExtractor, self).parse_html_declaration(i)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index b9b9c6f..a4dd42d 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -85,6 +85,8 @@ class HTMLExtractor(htmlparser.HTMLParser):

         self.lineno_start_cache = [0]

+        self.override_comment_update = False
+
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -245,8 +247,21 @@ class HTMLExtractor(htmlparser.HTMLParser):
         self.handle_empty_tag('&{};'.format(name), is_block=False)

     def handle_comment(self, data: str):
+        # Check if the comment is unclosed, if so, we need to override position
+        i = self.line_offset + self.offset + len(data) + 4
+        if self.rawdata[i:i + 3] != '-->':
+            self.handle_data('<')
+            self.override_comment_update = True
+            return
         self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

+    def updatepos(self, i: int, j: int) -> int:
+        if self.override_comment_update:
+            self.override_comment_update = False
+            i = 0
+            j = 1
+        return super().updatepos(i, j)
+
     def handle_decl(self, data: str):
         self.handle_empty_tag('<!{}>'.format(data), is_block=True)

@@ -270,7 +285,11 @@ class HTMLExtractor(htmlparser.HTMLParser):
             if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
                 # We have encountered the bug in #1534 (Python bug `gh-77057`).
                 # Provide an override until we drop support for Python < 3.13.
-                return self.parse_bogus_comment(i)
+                result = self.parse_bogus_comment(i)
+                if result == -1:
+                    self.handle_data(self.rawdata[i:i + 1])
+                    return i + 1
+                return result
             return super().parse_html_declaration(i)
         # This is not the beginning of a raw block so treat as plain data
         # and avoid consuming any tags which may follow (see #1066).
@@ -291,7 +310,8 @@ class HTMLExtractor(htmlparser.HTMLParser):
         self.__starttag_text = None
         endpos = self.check_for_whole_start_tag(i)
         if endpos < 0:
-            return endpos
+            self.handle_data(self.rawdata[i:i + 1])
+            return i + 1
         rawdata = self.rawdata
         self.__starttag_text = rawdata[i:endpos]

--
2.54.0


From d4cfcb5c2b45634199b636bfac79600a1786c552 Mon Sep 17 00:00:00 2001
From: David King <dking@redhat.com>
Date: Tue, 28 Apr 2026 16:57:49 +0100
Subject: [PATCH 3/4] Backport upstream 3.10.1 to 3.10.2 fixes

---
 markdown/htmlparser.py                       | 74 ++++++++++++---
 markdown/inlinepatterns.py                   |  8 +-
 tests/test_syntax/blocks/test_html_blocks.py | 97 ++++++++++++++++++--
 3 files changed, 155 insertions(+), 24 deletions(-)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index a4dd42d..08f5a32 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -30,6 +30,9 @@ import importlib.util
 import sys


+# Included for versions which do not have current comment fix
+commentclose = re.compile(r'--!?>')
+
 # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
 # Users can still do `from html import parser` and get the default behavior.
 spec = importlib.util.find_spec('html.parser')
@@ -37,6 +40,12 @@ htmlparser = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(htmlparser)
 sys.modules['htmlparser'] = htmlparser

+# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
+# throwing it away. When we see it, we will process it as data.
+htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')
+
+htmlparser.endtagopen = re.compile('</[a-zA-Z]?')
+
 # Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
 htmlparser.piclose = re.compile(r'\?>')
 # Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
@@ -67,6 +76,30 @@ htmlparser.locatestarttagend_tolerant = re.compile(r"""
 blank_line_re = re.compile(r'^([ ]*\n){2}')


+class _HTMLParser(htmlparser.HTMLParser):
+    """Handle special start and end tags."""
+
+    def parse_endtag(self, i):
+        start = self.rawdata[i:i+3]
+        c = ord(start[-1])
+        if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122):
+            self.handle_data(self.rawdata[i:i + 2])
+            return i + 2
+        return super().parse_endtag(i)
+
+    def parse_starttag(self, i):  # pragma: no cover
+        # Treat `</>` as normal data as it is not a real tag.
+        if self.rawdata[i:i + 3] == '</>':
+            self.handle_data(self.rawdata[i:i + 3])
+            return i + 3
+
+        return super().parse_starttag(i)
+
+
+# Overwrite `HTMLParser` with our custom one for projects like MkDocs that pull it in
+htmlparser.HTMLParser = _HTMLParser
+
+
 class HTMLExtractor(htmlparser.HTMLParser):
     """
     Extract raw HTML from text.
@@ -85,8 +118,6 @@ class HTMLExtractor(htmlparser.HTMLParser):

         self.lineno_start_cache = [0]

-        self.override_comment_update = False
-
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -247,21 +278,8 @@ class HTMLExtractor(htmlparser.HTMLParser):
         self.handle_empty_tag('&{};'.format(name), is_block=False)

     def handle_comment(self, data: str):
-        # Check if the comment is unclosed, if so, we need to override position
-        i = self.line_offset + self.offset + len(data) + 4
-        if self.rawdata[i:i + 3] != '-->':
-            self.handle_data('<')
-            self.override_comment_update = True
-            return
         self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

-    def updatepos(self, i: int, j: int) -> int:
-        if self.override_comment_update:
-            self.override_comment_update = False
-            i = 0
-            j = 1
-        return super().updatepos(i, j)
-
     def handle_decl(self, data: str):
         self.handle_empty_tag('<!{}>'.format(data), is_block=True)

@@ -280,6 +298,18 @@ class HTMLExtractor(htmlparser.HTMLParser):
         self.handle_data('<?')
         return i + 2

+    def parse_comment(self, i, report=True):
+        rawdata = self.rawdata
+        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+        match = commentclose.search(rawdata, i+4)
+        if not match:
+            self.handle_data('<')
+            return i + 1
+        if report:
+            j = match.start()
+            self.handle_comment(rawdata[i+4: j])
+        return match.end()
+
     def parse_html_declaration(self, i: int) -> int:
         if self.at_line_start() or self.intail:
             if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
@@ -296,6 +326,15 @@ class HTMLExtractor(htmlparser.HTMLParser):
         self.handle_data('<!')
         return i + 2

+    def parse_bogus_comment(self, i, report=0):
+        # Override the default behavior so that bogus comments get passed
+        # through unaltered by setting `report` to `0` (see #1425).
+        pos = super().parse_bogus_comment(i, report)
+        if pos == -1:  # pragma: no cover
+            return -1
+        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
+        return pos
+
     # The rest has been copied from base class in standard lib to address #1036.
     # As `__startag_text` is private, all references to it must be in this subclass.
     # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
@@ -307,6 +346,11 @@ class HTMLExtractor(htmlparser.HTMLParser):
         return self.__starttag_text

     def parse_starttag(self, i: int) -> int:  # pragma: no cover
+        # Treat `</>` as normal data as it is not a real tag.
+        if self.rawdata[i:i + 3] == '</>':
+            self.handle_data(self.rawdata[i:i + 3])
+            return i + 3
+
         self.__starttag_text = None
         endpos = self.check_for_whole_start_tag(i)
         if endpos < 0:
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py
index 296ab83..78310fc 100644
--- a/markdown/inlinepatterns.py
+++ b/markdown/inlinepatterns.py
@@ -161,7 +161,13 @@ AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
 AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
 """ Match an automatic email link (`<me@example.com>`). """

-HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
+HTML_RE = (
+    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'
+    r'!--(?:(?!<!--|-->).)*--|'
+    r'[?](?:(?!<[?]|[?]>).)*[?]|'
+    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'
+    ')>)'
+)
 """ Match an HTML tag (`<...>`). """

 ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index e8c37b3..3b95ec7 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -782,16 +782,10 @@ class TestHTMLBlocks(TestCase):
             '<!-- *foo* -->'
         )

-    # Note: this is a change in behavior for Python-Markdown, which does *not* match the reference
-    # implementation. However, it does match the HTML5 spec. Declarations must start with either
-    # `<!DOCTYPE` or `<![`. Anything else that starts with `<!` is a comment. According to the
-    # HTML5 spec, a comment without the hyphens is a "bogus comment", but a comment nonetheless.
-    # See https://www.w3.org/TR/html52/syntax.html#markup-declaration-open-state.
-    # If we wanted to change this behavior, we could override `HTMLParser.parse_bogus_comment()`.
     def test_bogus_comment(self):
         self.assertMarkdownRenders(
-            '<!*foo*>',
-            '<!--*foo*-->'
+            '<!invalid>',
+            '<p>&lt;!invalid&gt;</p>'
         )

     def test_raw_multiline_comment(self):
@@ -1624,3 +1618,90 @@ class TestHTMLBlocks(TestCase):
         placeholder = md.htmlStash.get_placeholder(md.htmlStash.html_counter + 1)
         result = md.postprocessors['raw_html'].run(placeholder)
         self.assertEqual(placeholder, result)
+
+    def test_bogus_comment_endtag(self):
+        self.assertMarkdownRenders(
+            '</#invalid>',
+            '<p>&lt;/#invalid&gt;</p>'
+        )
+
+    def test_issue_1590(self):
+        """Test case with comments in table for issue #1590."""
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                <table>
+                <!--[if mso]>-->
+                <td>foo</td>
+                <!--<!endif]-->
+                <td>bar</td>
+                </table>
+                '''
+            ),
+            self.dedent(
+                '''
+                <table>
+                <!--[if mso]>-->
+                <td>foo</td>
+                <!--<!endif]-->
+                <td>bar</td>
+                </table>
+                '''
+            )
+        )
+
+    def test_stress_comment_handling(self):
+        """Stress test the comment handling."""
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                `</` <!-- `<!--[if mso]>` and <!-- </> and `<!--[if mso]>`
+
+                <!-- and <!-- `<!--[if mso]>` and </> `</` and `<!--[if mso]>`
+
+                <!-- Real comment -->
+
+                `<!--[if mso]>` `</` `<!--[if mso]>` and </> <!-- and <!--
+
+                </> `<!--[if mso]>` `</` <!--  and <!--  and `<!--[if mso]>`
+                '''
+            ),
+            self.dedent(
+                '''
+                <p><code>&lt;/</code> &lt;!-- <code>&lt;!--[if mso]&gt;</code> and &lt;!-- &lt;/&gt; and <code>&lt;!--[if mso]&gt;</code></p>
+                <p>&lt;!-- and &lt;!-- <code>&lt;!--[if mso]&gt;</code> and &lt;/&gt; <code>&lt;/</code> and <code>&lt;!--[if mso]&gt;</code></p>
+                <!-- Real comment -->
+                <p><code>&lt;!--[if mso]&gt;</code> <code>&lt;/</code> <code>&lt;!--[if mso]&gt;</code> and &lt;/&gt; &lt;!-- and &lt;!--</p>
+                <p>&lt;/&gt; <code>&lt;!--[if mso]&gt;</code> <code>&lt;/</code> &lt;!--  and &lt;!--  and <code>&lt;!--[if mso]&gt;</code></p>
+                '''  # noqa: E501
+            )
+        )
+
+    def test_unclosed_endtag(self):
+        """Ensure unclosed end tag does not have side effects."""
+
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                `</`
+
+                <div>
+                <!--[if mso]>-->
+                <p>foo</p>
+                <!--<!endif]-->
+                </div>
+                '''
+            ),
+            self.dedent(
+                '''
+                <p><code>&lt;/</code></p>
+                <div>
+                <!--[if mso]>-->
+                <p>foo</p>
+                <!--<!endif]-->
+                </div>
+                '''
+            )
+        )
--
2.54.0


From 719b1921b528706c87218d6324c565b4627bbe4f Mon Sep 17 00:00:00 2001
From: David King <dking@redhat.com>
Date: Mon, 11 May 2026 08:03:13 +0100
Subject: [PATCH 4/4] Extra fix for failing tests

---
 markdown/htmlparser.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 08f5a32..bb64f40 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -70,6 +70,23 @@ htmlparser.locatestarttagend_tolerant = re.compile(r"""
    )?
   \s*                                 # trailing whitespace
 """, re.VERBOSE)
+# Monkeypatch `locatetagend` if it exists (Python 3.14+) to also exclude backticks.
+# `check_for_whole_start_tag` uses `locatetagend` instead of `locatestarttagend_tolerant` on 3.14+.
+if hasattr(htmlparser, 'locatetagend'):
+    htmlparser.locatetagend = re.compile(r"""
+      [a-zA-Z][^`\t\n\r\f />]*           # tag name <= added backtick
+      [\t\n\r\f /]*                       # optional whitespace before attribute name
+      (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^`\t\n\r\f /=>]*  # attribute name <= added backtick
+        (?:[\t\n\r\f ]*=[\t\n\r\f ]*      # value indicator
+          (?:'[^']*'                      # LITA-enclosed value
+            |"[^"]*"                      # LIT-enclosed value
+            |(?!['"])[^`>\t\n\r\f ]*      # bare value <= added backtick
+           )
+         )?
+        [\t\n\r\f /]*                     # possibly followed by a space
+       )*
+       >?
+    """, re.VERBOSE)

 # Match a blank line at the start of a block of text (two newlines).
 # The newlines may be preceded by additional whitespace.
--
2.54.0