From 3cc003e8693e50112189e9cd1b3116aa4c9003c4 Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Wed, 18 Jun 2025 10:29:03 -0400
Subject: [PATCH 1/3] Ensure incomplete markup declaration in raw HTML doesn't
crash parser.
See Python bug report at gh-77057 for details. Until we drop support for
Python < 3.13 (where this was fixed upstream), we need to avoid the
unwanted error by checking for it explicitly. Fixes #1534.
---
markdown/extensions/md_in_html.py | 4 ++++
markdown/htmlparser.py | 4 ++++
tests/test_syntax/blocks/test_html_blocks.py | 7 +++++++
3 files changed, 15 insertions(+)
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index 86cf00d..f95406e 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -218,6 +218,10 @@ class HTMLExtractorExtra(HTMLExtractor):
def parse_html_declaration(self, i):
if self.at_line_start() or self.intail or self.mdstack:
+ if self.rawdata[i:i+3] == '<![
'
+ )
+
def test_raw_cdata_code_span(self):
self.assertMarkdownRenders(
self.dedent(
--
2.54.0
From 1cc3f662b7bafe526e1a2fe72fff1bb7f784e346 Mon Sep 17 00:00:00 2001
From: Isaac Muse
Date: Thu, 19 Jun 2025 09:46:13 -0600
Subject: [PATCH 2/3] Fixes for Python 3.14
- Fix codecs deprecation
- Fix issue with unclosed `':
+ self.handle_data('<')
+ self.override_comment_update = True
+ return
self.handle_empty_tag(''.format(data), is_block=True)
+ def updatepos(self, i: int, j: int) -> int:
+ if self.override_comment_update:
+ self.override_comment_update = False
+ i = 0
+ j = 1
+ return super().updatepos(i, j)
+
def handle_decl(self, data):
self.handle_empty_tag(''.format(data), is_block=True)
@@ -259,7 +274,11 @@ class HTMLExtractor(htmlparser.HTMLParser):
if self.rawdata[i:i+3] == '
Date: Tue, 28 Apr 2026 16:57:49 +0100
Subject: [PATCH 3/3] Backport upstream 3.10.1 to 3.10.2 fixes
---
markdown/htmlparser.py | 92 +++++++++++++++----
markdown/inlinepatterns.py | 8 +-
tests/test_syntax/blocks/test_html_blocks.py | 97 ++++++++++++++++++--
3 files changed, 172 insertions(+), 25 deletions(-)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index e9fe1e4..9cdc05f 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -24,6 +24,9 @@ import importlib
import sys
+# Defined here for Python versions that do not include the current comment-parsing fix.
+commentclose = re.compile(r'--!?>')
+
# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
@@ -31,6 +34,12 @@ htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser
+# This is a hack. We are sneaking in `</>` so we can capture it without the HTML parser
+# throwing it away. When we see it, we will process it as data.
+htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')
+
+htmlparser.endtagopen = re.compile('</[a-zA-Z]?')  # `</` with an optional ASCII letter after it
+
# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
@@ -55,12 +64,52 @@ htmlparser.locatestarttagend_tolerant = re.compile(r"""
)?
\s* # trailing whitespace
""", re.VERBOSE)
+# Monkeypatch `locatetagend` if it exists (Python 3.14+) to also reject backticks.
+if hasattr(htmlparser, 'locatetagend'):
+ htmlparser.locatetagend = re.compile(r"""
+ [a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
+ [\t\n\r\f /]* # optional whitespace before attribute name
+ (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^`\t\n\r\f /=>]* # attribute name <= added backtick here
+ (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^`>\t\n\r\f ]* # bare value <= added backtick here
+ )
+ )?
+ [\t\n\r\f /]* # possibly followed by a space
+ )*
+ >?
+ """, re.VERBOSE)
# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')
+class _HTMLParser(htmlparser.HTMLParser):
+    """Handle special start and end tags: pass `</>` and an end-tag opener with no ASCII letter through as data."""
+
+    def parse_endtag(self, i):
+        start = self.rawdata[i:i+3]  # up to three characters starting at `i`
+        c = ord(start[-1])  # code point of the last character in that slice
+        if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122):  # third char missing or not A-Z/a-z
+            self.handle_data(self.rawdata[i:i + 2])  # emit the two characters at `i` as plain data
+            return i + 2
+        return super().parse_endtag(i)
+
+    def parse_starttag(self, i):  # pragma: no cover
+        # Treat `</>` as normal data as it is not a real tag.
+        if self.rawdata[i:i + 3] == '</>':
+            self.handle_data(self.rawdata[i:i + 3])
+            return i + 3
+
+        return super().parse_starttag(i)
+
+
+# Replace `HTMLParser` on our module copy with the subclass above, for projects like MkDocs that pull it in.
+htmlparser.HTMLParser = _HTMLParser
+
+
class HTMLExtractor(htmlparser.HTMLParser):
"""
Extract raw HTML from text.
@@ -76,8 +125,6 @@ class HTMLExtractor(htmlparser.HTMLParser):
# Block tags that should contain no content (self closing)
self.empty_tags = set(['hr'])
- self.override_comment_update = False
-
# This calls self.reset
super().__init__(*args, **kwargs)
self.md = md
@@ -235,22 +282,9 @@ class HTMLExtractor(htmlparser.HTMLParser):
def handle_entityref(self, name):
self.handle_empty_tag('&{};'.format(name), is_block=False)
- def handle_comment(self, data: str):
- # Check if the comment is unclosed, if so, we need to override position
- i = self.line_offset + self.offset + len(data) + 4
- if self.rawdata[i:i + 3] != '-->':
- self.handle_data('<')
- self.override_comment_update = True
- return
+ def handle_comment(self, data):
self.handle_empty_tag(''.format(data), is_block=True)
- def updatepos(self, i: int, j: int) -> int:
- if self.override_comment_update:
- self.override_comment_update = False
- i = 0
- j = 1
- return super().updatepos(i, j)
-
def handle_decl(self, data):
self.handle_empty_tag(''.format(data), is_block=True)
@@ -269,6 +303,18 @@ class HTMLExtractor(htmlparser.HTMLParser):
self.handle_data('')
return i + 2
+ def parse_comment(self, i, report=True):
+ rawdata = self.rawdata
+ assert rawdata.startswith(').)*--)>)'
+HTML_RE = (
+    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'          # opening or closing tag
+    r'!--(?:(?!<!--|-->).)*--|'                   # comment with no nested opener/closer inside
+    r'[?](?:(?!<[?]|[?]>).)*[?]|'                 # processing instruction
+    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'  # CDATA section
+    ')>)'
+)
# "&" (decimal) or "&" (hex) or "&" (named)
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index 76d21af..2a5d5f4 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -782,16 +782,10 @@ class TestHTMLBlocks(TestCase):
''
)
- # Note: this is a change in behavior for Python-Markdown, which does *not* match the reference
- # implementation. However, it does match the HTML5 spec. Declarations must start with either
- # `',
- ''
+ '',
+ '<!invalid>
'
)
def test_raw_multiline_comment(self):
@@ -1624,3 +1618,90 @@ class TestHTMLBlocks(TestCase):
placeholder = md.htmlStash.get_placeholder(md.htmlStash.html_counter + 1)
result = md.postprocessors['raw_html'].run(placeholder)
self.assertEqual(placeholder, result)
+
+    def test_bogus_comment_endtag(self):
+        self.assertMarkdownRenders(
+            '</#invalid>',  # `</` followed by a non-letter
+            '<p>&lt;/#invalid&gt;</p>'  # expected: escaped text inside a paragraph
+        )
+
+ def test_issue_1590(self):
+ """Test case with comments in table for issue #1590."""
+
+ self.assertMarkdownRenders(
+ self.dedent(
+ '''
+
+ '''
+ ),
+ self.dedent(
+ '''
+
+ '''
+ )
+ )
+
+ def test_stress_comment_handling(self):
+ """Stress test the comment handling."""
+
+ self.assertMarkdownRenders(
+ self.dedent(
+ '''
+ ``
+
+ `
+ <!--[if mso]> </ <!--[if mso]> and </> <!-- and <!--
+ </> <!--[if mso]> </ <!-- and <!-- and <!--[if mso]>
+ ''' # noqa: E501
+ )
+ )
+
+ def test_unclosed_endtag(self):
+ """Ensure unclosed end tag does not have side effects."""
+
+ self.assertMarkdownRenders(
+ self.dedent(
+ '''
+ ``
+
+
+ '''
+ ),
+ self.dedent(
+ '''
+ </
+
+ '''
+ )
+ )
--
2.54.0