Skip to content

Commit ba96f34

Browse files
♻️ Refactor backslash escape logic (#276)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 500e69e commit ba96f34

File tree

13 files changed

+173
-67
lines changed

13 files changed

+173
-67
lines changed

markdown_it/parser_core.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,15 @@
77
from __future__ import annotations
88

99
from .ruler import RuleFunc, Ruler
10-
from .rules_core import block, inline, linkify, normalize, replace, smartquotes
10+
from .rules_core import (
11+
block,
12+
inline,
13+
linkify,
14+
normalize,
15+
replace,
16+
smartquotes,
17+
text_join,
18+
)
1119
from .rules_core.state_core import StateCore
1220

1321
_rules: list[tuple[str, RuleFunc]] = [
@@ -17,6 +25,7 @@
1725
("linkify", linkify),
1826
("replacements", replace),
1927
("smartquotes", smartquotes),
28+
("text_join", text_join),
2029
]
2130

2231

markdown_it/parser_inline.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,18 @@
2828
("entity", rules_inline.entity),
2929
]
3030

31+
# Note: the `rules2` ruleset was created specifically for emphasis/strikethrough
32+
# post-processing and may be changed in the future.
33+
#
34+
# Don't use this for anything except pairs (plugins working with `balance_pairs`).
35+
#
3136
_rules2: list[tuple[str, RuleFunc]] = [
3237
("balance_pairs", rules_inline.link_pairs),
3338
("strikethrough", rules_inline.strikethrough.postProcess),
3439
("emphasis", rules_inline.emphasis.postProcess),
35-
("text_collapse", rules_inline.text_collapse),
40+
# Rules for pairs split '**' into separate text tokens, some of which may be left unused;
41+
# the rule below merges those unused segments back into the surrounding text.
42+
("fragments_join", rules_inline.fragments_join),
3643
]
3744

3845

markdown_it/presets/commonmark.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def make() -> PresetType:
4040
"highlight": None,
4141
},
4242
"components": {
43-
"core": {"rules": ["normalize", "block", "inline"]},
43+
"core": {"rules": ["normalize", "block", "inline", "text_join"]},
4444
"block": {
4545
"rules": [
4646
"blockquote",
@@ -68,7 +68,7 @@ def make() -> PresetType:
6868
"newline",
6969
"text",
7070
],
71-
"rules2": ["balance_pairs", "emphasis", "text_collapse"],
71+
"rules2": ["balance_pairs", "emphasis", "fragments_join"],
7272
},
7373
},
7474
}

markdown_it/presets/zero.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,11 @@ def make() -> PresetType:
3333
"highlight": None,
3434
},
3535
"components": {
36-
"core": {"rules": ["normalize", "block", "inline"]},
36+
"core": {"rules": ["normalize", "block", "inline", "text_join"]},
3737
"block": {"rules": ["paragraph"]},
38-
"inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]},
38+
"inline": {
39+
"rules": ["text"],
40+
"rules2": ["balance_pairs", "fragments_join"],
41+
},
3942
},
4043
}

markdown_it/rules_core/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"replace",
77
"smartquotes",
88
"linkify",
9+
"text_join",
910
)
1011

1112
from .block import block
@@ -15,3 +16,4 @@
1516
from .replacements import replace
1617
from .smartquotes import smartquotes
1718
from .state_core import StateCore
19+
from .text_join import text_join

markdown_it/rules_core/text_join.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Join raw text tokens with the rest of the text
2+
3+
This is set as a separate rule to provide an opportunity for plugins
4+
to run text replacements after text join, but before escape join.
5+
6+
For example, `\\:)` shouldn't be replaced with an emoji.
7+
"""
8+
from __future__ import annotations
9+
10+
from ..token import Token
11+
from .state_core import StateCore
12+
13+
14+
def text_join(state: StateCore) -> None:
    """Join raw text for escape sequences (`text_special`) tokens with the rest of the text.

    For every token in ``state.tokens`` whose type is ``"inline"``:

    * each ``text_special`` child is retyped to ``text``;
    * every run of adjacent ``text`` children is merged into the first
      token of the run by appending the later tokens' ``content``;
    * the resulting child list is stored back on the inline token.

    Tokens are mutated in place and nothing is returned.  An inline token
    whose ``children`` is ``None`` or empty ends up with an empty list.
    """
    # Only attributes of the tokens are reassigned below; ``state.tokens``
    # itself is never modified, so it can be iterated without copying it.
    for inline_token in state.tokens:
        if inline_token.type != "inline":
            continue

        # convert text_special to text and join all adjacent text nodes
        merged: list[Token] = []
        for child in inline_token.children or []:
            if child.type == "text_special":
                child.type = "text"

            if child.type == "text" and merged and merged[-1].type == "text":
                # Extend the previous text node instead of keeping a new one.
                merged[-1].content += child.content
            else:
                merged.append(child)

        inline_token.children = merged

markdown_it/rules_inline/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
__all__ = (
22
"StateInline",
33
"text",
4-
"text_collapse",
4+
"fragments_join",
55
"link_pairs",
66
"escape",
77
"newline",
@@ -20,10 +20,10 @@
2020
from .balance_pairs import link_pairs
2121
from .entity import entity
2222
from .escape import escape
23+
from .fragments_join import fragments_join
2324
from .html_inline import html_inline
2425
from .image import image
2526
from .link import link
2627
from .newline import newline
2728
from .state_inline import StateInline
2829
from .text import text
29-
from .text_collapse import text_collapse

markdown_it/rules_inline/escape.py

+52-39
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,58 @@
44
from ..common.utils import isStrSpace
55
from .state_inline import StateInline
66

7+
8+
def escape(state: StateInline, silent: bool) -> bool:
    """Process escaped chars and hardbreaks.

    Looks at ``state.src`` starting at ``state.pos``:

    * Returns ``False`` without touching ``state`` when the character at
      ``state.pos`` is not a backslash, or when the backslash is the last
      character before ``state.posMax``.
    * Backslash followed by a newline: pushes a ``hardbreak`` (``br``)
      token, skips the following characters for which ``isStrSpace`` is
      true, and leaves ``state.pos`` on the first character after them.
    * Backslash followed by anything else: pushes a ``text_special`` token
      whose ``markup`` is the original two-part sequence, whose ``info`` is
      ``"escape"`` and whose ``content`` is the bare character when it is
      in ``_ESCAPED`` or the untouched sequence otherwise; ``state.pos``
      is moved past the sequence.

    :param state: inline parser state (reads ``src``, ``pos``, ``posMax``).
    :param silent: when true, only advance ``state.pos``; push no tokens.
    :return: ``True`` if an escape sequence or hardbreak was consumed.
    """
    pos = state.pos
    maximum = state.posMax

    if state.src[pos] != "\\":
        return False

    pos += 1

    # '\' at the end of the inline block
    if pos >= maximum:
        return False

    ch1 = state.src[pos]

    if ch1 == "\n":
        if not silent:
            state.push("hardbreak", "br", 0)
        pos += 1
        # skip leading whitespaces from next line
        while pos < maximum and isStrSpace(state.src[pos]):
            pos += 1

        state.pos = pos
        return True

    escapedStr = ch1

    # A high surrogate directly followed by a low surrogate is kept together
    # so the pair is escaped (and later emitted) as one unit.
    if 0xD800 <= ord(ch1) <= 0xDBFF and pos + 1 < maximum:
        ch2 = state.src[pos + 1]
        if 0xDC00 <= ord(ch2) <= 0xDFFF:
            escapedStr += ch2
            pos += 1

    origStr = "\\" + escapedStr

    if not silent:
        token = state.push("text_special", "", 0)
        # Only characters listed in _ESCAPED lose their backslash.
        token.content = escapedStr if ch1 in _ESCAPED else origStr
        token.markup = origStr
        token.info = "escape"

    state.pos = pos + 1
    return True
57+
58+
759
_ESCAPED = {
860
"!",
961
'"',
@@ -38,42 +90,3 @@
3890
"}",
3991
"~",
4092
}
41-
42-
43-
def escape(state: StateInline, silent: bool) -> bool:
44-
pos = state.pos
45-
maximum = state.posMax
46-
47-
if state.src[pos] != "\\":
48-
return False
49-
50-
pos += 1
51-
52-
if pos < maximum:
53-
ch = state.src[pos]
54-
55-
if ch in _ESCAPED:
56-
if not silent:
57-
state.pending += state.src[pos]
58-
state.pos += 2
59-
return True
60-
61-
if ch == "\n":
62-
if not silent:
63-
state.push("hardbreak", "br", 0)
64-
65-
pos += 1
66-
# skip leading whitespaces from next line
67-
while pos < maximum:
68-
ch = state.src[pos]
69-
if not isStrSpace(ch):
70-
break
71-
pos += 1
72-
73-
state.pos = pos
74-
return True
75-
76-
if not silent:
77-
state.pending += "\\"
78-
state.pos += 1
79-
return True

markdown_it/rules_inline/text_collapse.py renamed to markdown_it/rules_inline/fragments_join.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .state_inline import StateInline
22

33

4-
def text_collapse(state: StateInline) -> None:
4+
def fragments_join(state: StateInline) -> None:
55
"""
66
Clean up tokens after emphasis and strikethrough postprocessing:
77
merge adjacent text nodes into one and re-calculate all token levels

tests/test_api/test_main.py

+18-17
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def test_get_rules():
1313
"linkify",
1414
"replacements",
1515
"smartquotes",
16+
"text_join",
1617
],
1718
"block": [
1819
"table",
@@ -40,21 +41,21 @@ def test_get_rules():
4041
"html_inline",
4142
"entity",
4243
],
43-
"inline2": ["balance_pairs", "strikethrough", "emphasis", "text_collapse"],
44+
"inline2": ["balance_pairs", "strikethrough", "emphasis", "fragments_join"],
4445
}
4546

4647

4748
def test_load_presets():
4849
md = MarkdownIt("zero")
4950
assert md.get_active_rules() == {
5051
"block": ["paragraph"],
51-
"core": ["normalize", "block", "inline"],
52+
"core": ["normalize", "block", "inline", "text_join"],
5253
"inline": ["text"],
53-
"inline2": ["balance_pairs", "text_collapse"],
54+
"inline2": ["balance_pairs", "fragments_join"],
5455
}
5556
md = MarkdownIt("commonmark")
5657
assert md.get_active_rules() == {
57-
"core": ["normalize", "block", "inline"],
58+
"core": ["normalize", "block", "inline", "text_join"],
5859
"block": [
5960
"code",
6061
"fence",
@@ -79,7 +80,7 @@ def test_load_presets():
7980
"html_inline",
8081
"entity",
8182
],
82-
"inline2": ["balance_pairs", "emphasis", "text_collapse"],
83+
"inline2": ["balance_pairs", "emphasis", "fragments_join"],
8384
}
8485

8586

@@ -94,33 +95,33 @@ def test_enable():
9495
md = MarkdownIt("zero").enable("heading")
9596
assert md.get_active_rules() == {
9697
"block": ["heading", "paragraph"],
97-
"core": ["normalize", "block", "inline"],
98+
"core": ["normalize", "block", "inline", "text_join"],
9899
"inline": ["text"],
99-
"inline2": ["balance_pairs", "text_collapse"],
100+
"inline2": ["balance_pairs", "fragments_join"],
100101
}
101102
md.enable(["backticks", "autolink"])
102103
assert md.get_active_rules() == {
103104
"block": ["heading", "paragraph"],
104-
"core": ["normalize", "block", "inline"],
105+
"core": ["normalize", "block", "inline", "text_join"],
105106
"inline": ["text", "backticks", "autolink"],
106-
"inline2": ["balance_pairs", "text_collapse"],
107+
"inline2": ["balance_pairs", "fragments_join"],
107108
}
108109

109110

110111
def test_disable():
111112
md = MarkdownIt("zero").disable("inline")
112113
assert md.get_active_rules() == {
113114
"block": ["paragraph"],
114-
"core": ["normalize", "block"],
115+
"core": ["normalize", "block", "text_join"],
115116
"inline": ["text"],
116-
"inline2": ["balance_pairs", "text_collapse"],
117+
"inline2": ["balance_pairs", "fragments_join"],
117118
}
118119
md.disable(["text"])
119120
assert md.get_active_rules() == {
120121
"block": ["paragraph"],
121-
"core": ["normalize", "block"],
122+
"core": ["normalize", "block", "text_join"],
122123
"inline": [],
123-
"inline2": ["balance_pairs", "text_collapse"],
124+
"inline2": ["balance_pairs", "fragments_join"],
124125
}
125126

126127

@@ -130,15 +131,15 @@ def test_reset():
130131
md.disable("inline")
131132
assert md.get_active_rules() == {
132133
"block": ["paragraph"],
133-
"core": ["normalize", "block"],
134+
"core": ["normalize", "block", "text_join"],
134135
"inline": ["text"],
135-
"inline2": ["balance_pairs", "text_collapse"],
136+
"inline2": ["balance_pairs", "fragments_join"],
136137
}
137138
assert md.get_active_rules() == {
138139
"block": ["paragraph"],
139-
"core": ["normalize", "block", "inline"],
140+
"core": ["normalize", "block", "inline", "text_join"],
140141
"inline": ["text"],
141-
"inline2": ["balance_pairs", "text_collapse"],
142+
"inline2": ["balance_pairs", "fragments_join"],
142143
}
143144

144145

tests/test_port/fixtures/linkify.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,11 @@ after
9696
<p>before</p>
9797
<p><a href="http://github.com">github.com</a></p>
9898
<p>after</p>
99-
.
99+
.
100+
101+
Don't match escaped
102+
.
103+
google\.com
104+
.
105+
<p>google.com</p>
106+
.

tests/test_port/fixtures/smartquotes.md

+14-1
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,17 @@ Should parse quotes adjacent to inline html, #677:
163163
.
164164
<p>“test <br>”</p>
165165
<p>“<br> test”</p>
166-
.
166+
.
167+
168+
Should be escapable:
169+
.
170+
"foo"
171+
172+
\"foo"
173+
174+
"foo\"
175+
.
176+
<p>“foo”</p>
177+
<p>&quot;foo&quot;</p>
178+
<p>&quot;foo&quot;</p>
179+
.

0 commit comments

Comments
 (0)