From ddc24b671b51d1390a4eb015100a9cae202b8781 Mon Sep 17 00:00:00 2001
From: Chris Sewell <chrisj_sewell@hotmail.com>
Date: Fri, 2 Jun 2023 08:57:33 +0200
Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20FIX:=20numeric=20character?=
 =?UTF-8?q?=20reference=20parsing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix issue with incorrect determination of a numeric character reference, and subsequent failure to convert to an integer code.

This also essentially fixes a bug in upstream, see https://github.com/markdown-it/markdown-it/issues/935
---
 markdown_it/common/utils.py             | 74 +++++++------------------
 tests/test_fuzzer.py                    | 14 ++---
 tests/test_port/fixtures/issue-fixes.md |  9 +++
 3 files changed, 36 insertions(+), 61 deletions(-)

diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
index 4effc00f..8a98b42c 100644
--- a/markdown_it/common/utils.py
+++ b/markdown_it/common/utils.py
@@ -2,7 +2,6 @@
 """
 from __future__ import annotations
 
-import html
 import re
 from typing import Match, TypeVar
 
@@ -52,9 +51,6 @@ def arrayReplaceAt(
     return src[:pos] + newElements + src[pos + 1 :]
 
 
-######################################################################
-
-
 def isValidEntityCode(c: int) -> bool:
     # broken sequence
     if c >= 0xD800 and c <= 0xDFFF:
@@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str:
     return chr(c)
 
 
-UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
+# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
 # ENTITY_RE_g       = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
 UNESCAPE_ALL_RE = re.compile(
     r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
     re.IGNORECASE,
 )
-DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
+DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
+DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
 
 
 def replaceEntityPattern(match: str, name: str) -> str:
-    """Convert HTML entity patterns
-
-    ::
-
-        https://www.google.com -> https%3A//www.google.com
-
+    """Convert HTML entity patterns,
+    see https://spec.commonmark.org/0.30/#entity-references
     """
-    code = 0
-
     if name in entities:
         return entities[name]
 
-    if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
-        code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
-        if isValidEntityCode(code):
-            return fromCodePoint(code)
-
-    return match
-
-
-# def replaceEntities(string):
-#   if (string.indexOf('&') < 0):
-#       return string
-#   return string.replace(ENTITY_RE, replaceEntityPattern)
+    code: None | int = None
+    if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
+        code = int(pat.group(1), 10)
+    elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
+        code = int(pat.group(1), 16)
 
+    if code is not None and isValidEntityCode(code):
+        return fromCodePoint(code)
 
-def unescapeMd(string: str) -> str:
-    raise NotImplementedError
-    # if "\\" in string:
-    #     return string
-    # return string.replace(UNESCAPE_MD_RE, "$1")
+    return match
 
 
 def unescapeAll(string: str) -> str:
@@ -154,30 +136,14 @@ def stripEscape(string: str) -> str:
     return ESCAPE_CHAR.sub(r"\1", string)
 
 
-# //////////////////////////////////////////////////////////////////////////////
-
-# TODO This section changed quite a lot, should re-check
-
-# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
-# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
-# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
-
-
-# def escapeHtml(string: str):
-
-#     if HTML_ESCAPE_REPLACE_RE.search(string):
-
-#         string = UNESCAPE_HTML_RE.sub("&", string)
-#         string = ESCAPE_AND_HTML.sub("&amp;", string)
-#         for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
-#             string = string.replace(k, v)
-
-#     return string
-
-
 def escapeHtml(raw: str) -> str:
-    # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
-    return html.escape(raw).replace("&#x27;", "'")
+    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
+    # like html.escape, but without escaping single quotes
+    raw = raw.replace("&", "&amp;") # Must be done first!
+    raw = raw.replace("<", "&lt;")
+    raw = raw.replace(">", "&gt;")
+    raw = raw.replace('"', "&quot;")
+    return raw
 
 
 # //////////////////////////////////////////////////////////////////////////////
diff --git a/tests/test_fuzzer.py b/tests/test_fuzzer.py
index 60cdddaa..f3666cc5 100644
--- a/tests/test_fuzzer.py
+++ b/tests/test_fuzzer.py
@@ -10,15 +10,15 @@
 from markdown_it import MarkdownIt
 
 TESTS = {
-    55363: ">```\n>",
-    55367: ">-\n>\n>",
-    # 55371: "[](so&#4»0;!"  TODO this did not fail
-    # 55401: "?c_" * 100_000  TODO this did not fail
+    55363: (">```\n>", "<blockquote>\n<pre><code></code></pre>\n</blockquote>\n"),
+    55367: (">-\n>\n>", "<blockquote>\n<ul>\n<li></li>\n</ul>\n</blockquote>\n"),
+    55371: ("[](so&#4H0;!", "<p>[](so&amp;#4H0;!</p>\n"),
+    # 55401: (("?c_" * 100000) + "c_", ""),  TODO this does not fail, just takes a long time
 }
 
 
-@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys())
-def test_fuzzing(raw_input):
+@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys())
+def test_fuzzing(raw_input, expected):
     md = MarkdownIt()
     md.parse(raw_input)
-    print(md.render(raw_input))
+    assert md.render(raw_input) == expected
diff --git a/tests/test_port/fixtures/issue-fixes.md b/tests/test_port/fixtures/issue-fixes.md
index 319945af..b630fcee 100644
--- a/tests/test_port/fixtures/issue-fixes.md
+++ b/tests/test_port/fixtures/issue-fixes.md
@@ -45,3 +45,12 @@ Fix CVE-2023-26303
 <p><img src="%5B" alt="
 " /></p>
 .
+
+Fix parsing of incorrect numeric character references
+.
+[](&#X22y;) &#X22y;
+[](&#35y;) &#35y;
+.
+<p><a href="&amp;#X22y;"></a> &amp;#X22y;
+<a href="&amp;#35y;"></a> &amp;#35y;</p>
+.

From 3f927a53d3376322182327b577b351fcaeaed5b2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 2 Jun 2023 07:03:29 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 markdown_it/common/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py
index 8a98b42c..6bf9a36f 100644
--- a/markdown_it/common/utils.py
+++ b/markdown_it/common/utils.py
@@ -139,7 +139,7 @@ def stripEscape(string: str) -> str:
 def escapeHtml(raw: str) -> str:
     """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
     # like html.escape, but without escaping single quotes
-    raw = raw.replace("&", "&amp;") # Must be done first!
+    raw = raw.replace("&", "&amp;")  # Must be done first!
     raw = raw.replace("<", "&lt;")
     raw = raw.replace(">", "&gt;")
     raw = raw.replace('"', "&quot;")