From 3191e76a3e6d0b4a89bbf9ec52bbc84aa24e22b8 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 18 Jun 2025 10:29:03 -0400 Subject: [PATCH 1/4] Ensure incomplete markup declaration in raw HTML doesn't crash parser. See Python bug report at gh-77057 for details. Until we drop support for Python < 3.13 (where this was fixed upstream), we need to avoid the unwanted error by checking for it explicitly. Fixes #1534. --- markdown/extensions/md_in_html.py | 4 ++++ markdown/htmlparser.py | 4 ++++ tests/test_syntax/blocks/test_html_blocks.py | 7 +++++++ 3 files changed, 15 insertions(+) diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 982d603..99001ca 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -227,6 +227,10 @@ class HTMLExtractorExtra(HTMLExtractor): def parse_html_declaration(self, i): if self.at_line_start() or self.intail or self.mdstack: + if self.rawdata[i:i+3] == ' int: if self.at_line_start() or self.intail: + if self.rawdata[i:i+3] == '<![

' + ) + def test_raw_cdata_code_span(self): self.assertMarkdownRenders( self.dedent( -- 2.54.0 From 1a38a252e759b6a7d86710640af2b7157feff51c Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Thu, 19 Jun 2025 09:46:13 -0600 Subject: [PATCH 2/4] Fixes for Python 3.14 - Fix codecs deprecation - Fix issue with unclosed `': + self.handle_data('<') + self.override_comment_update = True + return self.handle_empty_tag(''.format(data), is_block=True) + def updatepos(self, i: int, j: int) -> int: + if self.override_comment_update: + self.override_comment_update = False + i = 0 + j = 1 + return super().updatepos(i, j) + def handle_decl(self, data: str): self.handle_empty_tag(''.format(data), is_block=True) @@ -270,7 +285,11 @@ class HTMLExtractor(htmlparser.HTMLParser): if self.rawdata[i:i+3] == ' Date: Tue, 28 Apr 2026 16:57:49 +0100 Subject: [PATCH 3/4] Backport upstream 3.10.1 to 3.10.2 fixes --- markdown/htmlparser.py | 74 ++++++++++++--- markdown/inlinepatterns.py | 8 +- tests/test_syntax/blocks/test_html_blocks.py | 97 ++++++++++++++++++-- 3 files changed, 155 insertions(+), 24 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index a4dd42d..08f5a32 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -30,6 +30,9 @@ import importlib.util import sys +# Included for versions which do not have current comment fix +commentclose = re.compile(r'--!?>') + # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it. # Users can still do `from html import parser` and get the default behavior. spec = importlib.util.find_spec('html.parser') @@ -37,6 +40,12 @@ htmlparser = importlib.util.module_from_spec(spec) spec.loader.exec_module(htmlparser) sys.modules['htmlparser'] = htmlparser +# This is a hack. We are sneaking in `` so we can capture it without the HTML parser +# throwing it away. When we see it, we will process it as data. +htmlparser.starttagopen = re.compile('<[a-zA-Z]|') + +htmlparser.endtagopen = re.compile('` to close Processing Instructions. htmlparser.piclose = re.compile(r'\?>') # Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon. @@ -67,6 +76,30 @@ htmlparser.locatestarttagend_tolerant = re.compile(r""" blank_line_re = re.compile(r'^([ ]*\n){2}') +class _HTMLParser(htmlparser.HTMLParser): + """Handle special start and end tags.""" + + def parse_endtag(self, i): + start = self.rawdata[i:i+3] + c = ord(start[-1]) + if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122): + self.handle_data(self.rawdata[i:i + 2]) + return i + 2 + return super().parse_endtag(i) + + def parse_starttag(self, i): # pragma: no cover + # Treat `` as normal data as it is not a real tag. + if self.rawdata[i:i + 3] == '': + self.handle_data(self.rawdata[i:i + 3]) + return i + 3 + + return super().parse_starttag(i) + + +# Overwrite our custom one for people like MkDocs that pull it in +htmlparser.HTMLParser = _HTMLParser + + class HTMLExtractor(htmlparser.HTMLParser): """ Extract raw HTML from text. @@ -85,8 +118,6 @@ class HTMLExtractor(htmlparser.HTMLParser): self.lineno_start_cache = [0] - self.override_comment_update = False - # This calls self.reset super().__init__(*args, **kwargs) self.md = md @@ -247,21 +278,8 @@ class HTMLExtractor(htmlparser.HTMLParser): self.handle_empty_tag('&{};'.format(name), is_block=False) def handle_comment(self, data: str): - # Check if the comment is unclosed, if so, we need to override position - i = self.line_offset + self.offset + len(data) + 4 - if self.rawdata[i:i + 3] != '-->': - self.handle_data('<') - self.override_comment_update = True - return self.handle_empty_tag(''.format(data), is_block=True) - def updatepos(self, i: int, j: int) -> int: - if self.override_comment_update: - self.override_comment_update = False - i = 0 - j = 1 - return super().updatepos(i, j) - def handle_decl(self, data: str): self.handle_empty_tag(''.format(data), is_block=True) @@ -280,6 +298,18 @@ class HTMLExtractor(htmlparser.HTMLParser): self.handle_data(' int: if self.at_line_start() or self.intail: if self.rawdata[i:i+3] == ' int: # pragma: no cover + # Treat `` as normal data as it is not a real tag. + if self.rawdata[i:i + 3] == '': + self.handle_data(self.rawdata[i:i + 3]) + return i + 3 + self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py index 296ab83..78310fc 100644 --- a/markdown/inlinepatterns.py +++ b/markdown/inlinepatterns.py @@ -161,7 +161,13 @@ AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>' AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>' """ Match an automatic email link (``). """ -HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!).)*--)>)' +HTML_RE = ( + r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' + r'!--(?:(?!).)*--|' + r'[?](?:(?!<[?]|[?]>).)*[?]|' + r'!\[CDATA\[(?:(?!).)*\]\]' + ')>)' +) """ Match an HTML tag (`<...>`). """ ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)' diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index e8c37b3..3b95ec7 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -782,16 +782,10 @@ class TestHTMLBlocks(TestCase): '' ) - # Note: this is a change in behavior for Python-Markdown, which does *not* match the reference - # implementation. However, it does match the HTML5 spec. Declarations must start with either - # `', - '' + '', + '

<!invalid>

' ) def test_raw_multiline_comment(self): @@ -1624,3 +1618,90 @@ class TestHTMLBlocks(TestCase): placeholder = md.htmlStash.get_placeholder(md.htmlStash.html_counter + 1) result = md.postprocessors['raw_html'].run(placeholder) self.assertEqual(placeholder, result) + + def test_bogus_comment_endtag(self): + self.assertMarkdownRenders( + '', + '

</#invalid>

' + ) + + def test_issue_1590(self): + """Test case with comments in table for issue #1590.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + + + + + +
foobar
+ ''' + ), + self.dedent( + ''' + + + + + +
foobar
+ ''' + ) + ) + + def test_stress_comment_handling(self): + """Stress test the comment handling.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + `` and + + ` +

<!--[if mso]> </ <!--[if mso]> and </> <!-- and <!--

+

</> <!--[if mso]> </ <!-- and <!-- and <!--[if mso]>

+ ''' # noqa: E501 + ) + ) + + def test_unclosed_endtag(self): + """Ensure unclosed end tag does not have side effects.""" + + self.assertMarkdownRenders( + self.dedent( + ''' + ` + +

foo

+ + + ''' + ), + self.dedent( + ''' +

</

+
+ +

foo

+ +
+ ''' + ) + ) -- 2.54.0 From 719b1921b528706c87218d6324c565b4627bbe4f Mon Sep 17 00:00:00 2001 From: David King Date: Mon, 11 May 2026 08:03:13 +0100 Subject: [PATCH 4/4] Extra fix for failing tests --- markdown/htmlparser.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 08f5a32..bb64f40 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -70,6 +70,23 @@ htmlparser.locatestarttagend_tolerant = re.compile(r""" )? \s* # trailing whitespace """, re.VERBOSE) +# Monkeypatch `locatetagend` if it exists (Python 3.14+) to also exclude backticks. +# `check_for_whole_start_tag` uses `locatetagend` instead of `locatestarttagend_tolerant` on 3.14+. +if hasattr(htmlparser, 'locatetagend'): + htmlparser.locatetagend = re.compile(r""" + [a-zA-Z][^`\t\n\r\f />]* # tag name <= added backtick + [\t\n\r\f /]* # optional whitespace before attribute name + (?:(?<=['"\t\n\r\f /])[^`\t\n\r\f />][^`\t\n\r\f /=>]* # attribute name <= added backtick + (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^`>\t\n\r\f ]* # bare value <= added backtick + ) + )? + [\t\n\r\f /]* # possibly followed by a space + )* + >? + """, re.VERBOSE) # Match a blank line at the start of a block of text (two newlines). # The newlines may be preceded by additional whitespace. -- 2.54.0