From ddc24b671b51d1390a4eb015100a9cae202b8781 Mon Sep 17 00:00:00 2001 From: Chris Sewell Date: Fri, 2 Jun 2023 08:57:33 +0200 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20FIX:=20numeric=20character?= =?UTF-8?q?=20reference=20passing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix issue with incorrect determination of a numeric character reference, and subsequent failure to convert to an integer code. This also essentially fixes a bug in upstream, see https://github.com/markdown-it/markdown-it/issues/935 --- markdown_it/common/utils.py | 74 +++++++------------------ tests/test_fuzzer.py | 14 ++--- tests/test_port/fixtures/issue-fixes.md | 9 +++ 3 files changed, 36 insertions(+), 61 deletions(-) diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py index 4effc00f..8a98b42c 100644 --- a/markdown_it/common/utils.py +++ b/markdown_it/common/utils.py @@ -2,7 +2,6 @@ """ from __future__ import annotations -import html import re from typing import Match, TypeVar @@ -52,9 +51,6 @@ def arrayReplaceAt( return src[:pos] + newElements + src[pos + 1 :] -###################################################################### - - def isValidEntityCode(c: int) -> bool: # broken sequence if c >= 0xD800 and c <= 0xDFFF: @@ -89,47 +85,33 @@ def fromCodePoint(c: int) -> str: return chr(c) -UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])') +# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])') # ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE) UNESCAPE_ALL_RE = re.compile( r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});", re.IGNORECASE, ) -DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE) +DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})") +DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE) def replaceEntityPattern(match: str, name: str) -> str: - """Convert HTML entity patterns - - :: - - https://www.google.com -> https%3A//www.google.com - + """Convert HTML entity patterns, + see https://spec.commonmark.org/0.30/#entity-references """ - code = 0 - if name in entities: return entities[name] - if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name): - code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10) - if isValidEntityCode(code): - return fromCodePoint(code) - - return match - - -# def replaceEntities(string): -# if (string.indexOf('&') < 0): -# return string -# return string.replace(ENTITY_RE, replaceEntityPattern) + code: None | int = None + if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name): + code = int(pat.group(1), 10) + elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name): + code = int(pat.group(1), 16) + if code is not None and isValidEntityCode(code): + return fromCodePoint(code) -def unescapeMd(string: str) -> str: - raise NotImplementedError - # if "\\" in string: - # return string - # return string.replace(UNESCAPE_MD_RE, "$1") + return match def unescapeAll(string: str) -> str: @@ -154,30 +136,14 @@ def stripEscape(string: str) -> str: return ESCAPE_CHAR.sub(r"\1", string) -# ////////////////////////////////////////////////////////////////////////////// - -# TODO This section changed quite a lot, should re-check - -# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))") -# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))") -# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]') - - -# def escapeHtml(string: str): - -# if HTML_ESCAPE_REPLACE_RE.search(string): - -# string = UNESCAPE_HTML_RE.sub("&", string) -# string = ESCAPE_AND_HTML.sub("&", string) -# for k, v in {"<": "<", ">": ">", '"': """}.items(): -# string = string.replace(k, v) - -# return string - - def escapeHtml(raw: str) -> str: - # return html.escape(html.unescape(raw)).replace("'", "'") - return html.escape(raw).replace("'", "'") + """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.""" + # like html.escape, but without escaping single quotes + raw = raw.replace("&", "&") # Must be done first! + raw = raw.replace("<", "<") + raw = raw.replace(">", ">") + raw = raw.replace('"', """) + return raw # ////////////////////////////////////////////////////////////////////////////// diff --git a/tests/test_fuzzer.py b/tests/test_fuzzer.py index 60cdddaa..f3666cc5 100644 --- a/tests/test_fuzzer.py +++ b/tests/test_fuzzer.py @@ -10,15 +10,15 @@ from markdown_it import MarkdownIt TESTS = { - 55363: ">```\n>", - 55367: ">-\n>\n>", - # 55371: "[](soยป0;!" TODO this did not fail - # 55401: "?c_" * 100_000 TODO this did not fail + 55363: (">```\n>", "
\n
\n
\n"), + 55367: (">-\n>\n>", "
\n\n
\n"), + 55371: ("[](soH0;!", "

[](so&#4H0;!

\n"), + # 55401: (("?c_" * 100000) + "c_", ""), TODO this does not fail, just takes a long time } -@pytest.mark.parametrize("raw_input", TESTS.values(), ids=TESTS.keys()) -def test_fuzzing(raw_input): +@pytest.mark.parametrize("raw_input,expected", TESTS.values(), ids=TESTS.keys()) +def test_fuzzing(raw_input, expected): md = MarkdownIt() md.parse(raw_input) - print(md.render(raw_input)) + assert md.render(raw_input) == expected diff --git a/tests/test_port/fixtures/issue-fixes.md b/tests/test_port/fixtures/issue-fixes.md index 319945af..b630fcee 100644 --- a/tests/test_port/fixtures/issue-fixes.md +++ b/tests/test_port/fixtures/issue-fixes.md @@ -45,3 +45,12 @@ Fix CVE-2023-26303

. + +Fix parsing of incorrect numeric character references +. +[]("y;) "y; +[](#y;) #y; +. +

&#X22y; + &#35y;

+. From 3f927a53d3376322182327b577b351fcaeaed5b2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 2 Jun 2023 07:03:29 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- markdown_it/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown_it/common/utils.py b/markdown_it/common/utils.py index 8a98b42c..6bf9a36f 100644 --- a/markdown_it/common/utils.py +++ b/markdown_it/common/utils.py @@ -139,7 +139,7 @@ def stripEscape(string: str) -> str: def escapeHtml(raw: str) -> str: """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.""" # like html.escape, but without escaping single quotes - raw = raw.replace("&", "&") # Must be done first! + raw = raw.replace("&", "&") # Must be done first! raw = raw.replace("<", "<") raw = raw.replace(">", ">") raw = raw.replace('"', """)