♻️ Refactor backslash escape logic (#276)

chrisjsewell · pre-commit-ci[bot] · web-flow · commit ba96f34dc14c · 2023-06-02T15:18:38.000+02:00
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py
@@ -7,7 +7,15 @@
 from __future__ import annotations
 
 from .ruler import RuleFunc, Ruler
-from .rules_core import block, inline, linkify, normalize, replace, smartquotes
+from .rules_core import (
+    block,
+    inline,
+    linkify,
+    normalize,
+    replace,
+    smartquotes,
+    text_join,
+)
 from .rules_core.state_core import StateCore
 
 _rules: list[tuple[str, RuleFunc]] = [
@@ -17,6 +25,7 @@
     ("linkify", linkify),
     ("replacements", replace),
     ("smartquotes", smartquotes),
+    ("text_join", text_join),
 ]
 
 
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
@@ -28,11 +28,18 @@
     ("entity", rules_inline.entity),
 ]
 
+# Note `rule2` ruleset was created specifically for emphasis/strikethrough
+# post-processing and may be changed in the future.
+#
+# Don't use this for anything except pairs (plugins working with `balance_pairs`).
+#
 _rules2: list[tuple[str, RuleFunc]] = [
     ("balance_pairs", rules_inline.link_pairs),
     ("strikethrough", rules_inline.strikethrough.postProcess),
     ("emphasis", rules_inline.emphasis.postProcess),
-    ("text_collapse", rules_inline.text_collapse),
+    # rules for pairs separate '**' into its own text tokens, which may be left unused,
+    # rule below merges unused segments back with the rest of the text
+    ("fragments_join", rules_inline.fragments_join),
 ]
 
 
diff --git a/markdown_it/presets/commonmark.py b/markdown_it/presets/commonmark.py
@@ -40,7 +40,7 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {
                 "rules": [
                     "blockquote",
@@ -68,7 +68,7 @@ def make() -> PresetType:
                     "newline",
                     "text",
                 ],
-                "rules2": ["balance_pairs", "emphasis", "text_collapse"],
+                "rules2": ["balance_pairs", "emphasis", "fragments_join"],
             },
         },
     }
diff --git a/markdown_it/presets/zero.py b/markdown_it/presets/zero.py
@@ -33,8 +33,11 @@ def make() -> PresetType:
             "highlight": None,
         },
         "components": {
-            "core": {"rules": ["normalize", "block", "inline"]},
+            "core": {"rules": ["normalize", "block", "inline", "text_join"]},
             "block": {"rules": ["paragraph"]},
-            "inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]},
+            "inline": {
+                "rules": ["text"],
+                "rules2": ["balance_pairs", "fragments_join"],
+            },
         },
     }
diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py
@@ -6,6 +6,7 @@
     "replace",
     "smartquotes",
     "linkify",
+    "text_join",
 )
 
 from .block import block
@@ -15,3 +16,4 @@
 from .replacements import replace
 from .smartquotes import smartquotes
 from .state_core import StateCore
+from .text_join import text_join
diff --git a/markdown_it/rules_core/text_join.py b/markdown_it/rules_core/text_join.py
@@ -0,0 +1,34 @@
+"""Join raw text tokens with the rest of the text
+
+This is set as a separate rule to provide an opportunity for plugins
+to run text replacements after text join, but before escape join.
+
+For example, `\\:)` shouldn't be replaced with an emoji.
+"""
+from __future__ import annotations
+
+from ..token import Token
+from .state_core import StateCore
+
+
+def text_join(state: StateCore) -> None:
+    """Retype `text_special` children of inline tokens as `text` and merge adjacent text tokens."""
+
+    for inline_token in state.tokens[:]:  # iterate over a shallow copy of the token list
+        if inline_token.type != "inline":
+            continue  # only tokens of type "inline" have their children rewritten
+
+        # retype text_special as text, then fold each text child into a preceding text child
+        new_tokens: list[Token] = []
+        for child_token in inline_token.children or []:
+            if child_token.type == "text_special":
+                child_token.type = "text"
+            if (
+                child_token.type == "text"
+                and new_tokens
+                and new_tokens[-1].type == "text"
+            ):
+                new_tokens[-1].content += child_token.content  # extend the previous text token
+            else:
+                new_tokens.append(child_token)
+        inline_token.children = new_tokens  # a falsy `children` (None/empty) becomes []
diff --git a/markdown_it/rules_inline/__init__.py b/markdown_it/rules_inline/__init__.py
@@ -1,7 +1,7 @@
 __all__ = (
     "StateInline",
     "text",
-    "text_collapse",
+    "fragments_join",
     "link_pairs",
     "escape",
     "newline",
@@ -20,10 +20,10 @@
 from .balance_pairs import link_pairs
 from .entity import entity
 from .escape import escape
+from .fragments_join import fragments_join
 from .html_inline import html_inline
 from .image import image
 from .link import link
 from .newline import newline
 from .state_inline import StateInline
 from .text import text
-from .text_collapse import text_collapse
diff --git a/markdown_it/rules_inline/escape.py b/markdown_it/rules_inline/escape.py
@@ -4,6 +4,58 @@
 from ..common.utils import isStrSpace
 from .state_inline import StateInline
 
+
+def escape(state: StateInline, silent: bool) -> bool:
+    """Handle a backslash escape at `state.pos`; emit a hardbreak or `text_special` token."""
+    pos = state.pos
+    maximum = state.posMax
+
+    if state.src[pos] != "\\":
+        return False  # not a backslash: this rule does not apply
+
+    pos += 1
+
+    # '\' at the end of the inline block
+    if pos >= maximum:
+        return False
+
+    ch1 = state.src[pos]
+    ch1_ord = ord(ch1)
+    if ch1 == "\n":  # backslash directly followed by a newline
+        if not silent:  # in silent mode the position advances but no token is pushed
+            state.push("hardbreak", "br", 0)
+        pos += 1
+        # skip leading whitespace (as judged by isStrSpace) on the next line
+        while pos < maximum:
+            ch = state.src[pos]
+            if not isStrSpace(ch):
+                break
+            pos += 1
+
+        state.pos = pos
+        return True
+
+    escapedStr = state.src[pos]  # same character as ch1
+
+    # ch1 in the high-surrogate range: also consume a following low surrogate, if any
+    if ch1_ord >= 0xD800 and ch1_ord <= 0xDBFF and pos + 1 < maximum:
+        ch2 = state.src[pos + 1]
+        ch2_ord = ord(ch2)
+        if ch2_ord >= 0xDC00 and ch2_ord <= 0xDFFF:
+            escapedStr += ch2
+            pos += 1
+
+    origStr = "\\" + escapedStr  # the source text of the escape, backslash included
+
+    if not silent:
+        token = state.push("text_special", "", 0)
+        token.content = escapedStr if ch1 in _ESCAPED else origStr  # else keep the backslash
+        token.markup = origStr
+        token.info = "escape"
+
+    state.pos = pos + 1  # move past the last consumed character
+    return True
+
+
 _ESCAPED = {
     "!",
     '"',
@@ -38,42 +90,3 @@
     "}",
     "~",
 }
-
-
-def escape(state: StateInline, silent: bool) -> bool:
-    pos = state.pos
-    maximum = state.posMax
-
-    if state.src[pos] != "\\":
-        return False
-
-    pos += 1
-
-    if pos < maximum:
-        ch = state.src[pos]
-
-        if ch in _ESCAPED:
-            if not silent:
-                state.pending += state.src[pos]
-            state.pos += 2
-            return True
-
-        if ch == "\n":
-            if not silent:
-                state.push("hardbreak", "br", 0)
-
-            pos += 1
-            # skip leading whitespaces from next line
-            while pos < maximum:
-                ch = state.src[pos]
-                if not isStrSpace(ch):
-                    break
-                pos += 1
-
-            state.pos = pos
-            return True
-
-    if not silent:
-        state.pending += "\\"
-    state.pos += 1
-    return True
diff --git a/markdown_it/rules_inline/fragments_join.py b/markdown_it/rules_inline/fragments_join.py
@@ -1,7 +1,7 @@
 from .state_inline import StateInline
 
 
-def text_collapse(state: StateInline) -> None:
+def fragments_join(state: StateInline) -> None:
     """
     Clean up tokens after emphasis and strikethrough postprocessing:
     merge adjacent text nodes into one and re-calculate all token levels
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
@@ -13,6 +13,7 @@ def test_get_rules():
             "linkify",
             "replacements",
             "smartquotes",
+            "text_join",
         ],
         "block": [
             "table",
@@ -40,21 +41,21 @@ def test_get_rules():
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "strikethrough", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "strikethrough", "emphasis", "fragments_join"],
     }
 
 
 def test_load_presets():
     md = MarkdownIt("zero")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md = MarkdownIt("commonmark")
     assert md.get_active_rules() == {
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "block": [
             "code",
             "fence",
@@ -79,7 +80,7 @@ def test_load_presets():
             "html_inline",
             "entity",
         ],
-        "inline2": ["balance_pairs", "emphasis", "text_collapse"],
+        "inline2": ["balance_pairs", "emphasis", "fragments_join"],
     }
 
 
@@ -94,33 +95,33 @@ def test_enable():
     md = MarkdownIt("zero").enable("heading")
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.enable(["backticks", "autolink"])
     assert md.get_active_rules() == {
         "block": ["heading", "paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text", "backticks", "autolink"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
 def test_disable():
     md = MarkdownIt("zero").disable("inline")
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
     md.disable(["text"])
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block"],
+        "core": ["normalize", "block", "text_join"],
         "inline": [],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
@@ -130,15 +131,15 @@ def test_reset():
         md.disable("inline")
         assert md.get_active_rules() == {
             "block": ["paragraph"],
-            "core": ["normalize", "block"],
+            "core": ["normalize", "block", "text_join"],
             "inline": ["text"],
-            "inline2": ["balance_pairs", "text_collapse"],
+            "inline2": ["balance_pairs", "fragments_join"],
         }
     assert md.get_active_rules() == {
         "block": ["paragraph"],
-        "core": ["normalize", "block", "inline"],
+        "core": ["normalize", "block", "inline", "text_join"],
         "inline": ["text"],
-        "inline2": ["balance_pairs", "text_collapse"],
+        "inline2": ["balance_pairs", "fragments_join"],
     }
 
 
diff --git a/tests/test_port/fixtures/linkify.md b/tests/test_port/fixtures/linkify.md
@@ -96,4 +96,11 @@ after
 <p>before</p>
 <p><a href="http://github.com">github.com</a></p>
 <p>after</p>
-.
+.
+
+Don't match escaped
+.
+google\.com
+.
+<p>google.com</p>
+.
diff --git a/tests/test_port/fixtures/smartquotes.md b/tests/test_port/fixtures/smartquotes.md
@@ -163,4 +163,17 @@ Should parse quotes adjacent to inline html, #677:
 .
 <p>“test <br>”</p>
 <p>“<br> test”</p>
-.
+.
+
+Should be escapable:
+.
+"foo"
+
+\"foo"
+
+"foo\"
+.
+<p>“foo”</p>
+<p>&quot;foo&quot;</p>
+<p>&quot;foo&quot;</p>
+.
diff --git a/tests/test_port/fixtures/typographer.md b/tests/test_port/fixtures/typographer.md