6
6
7
7
# Original code Copyright 2008 [Jack Miller](https://codezen.org/)
8
8
9
- # All changes Copyright 2008-2014 The Python Markdown Project
9
+ # All changes Copyright 2008-2024 The Python Markdown Project
10
10
11
11
# License: [BSD](https://opensource.org/licenses/bsd-license.php)
12
12
21
21
22
22
from . import Extension
23
23
from ..treeprocessors import Treeprocessor
24
- from ..util import code_escape , parseBoolValue , AMP_SUBSTITUTE , HTML_PLACEHOLDER_RE , AtomicString
24
+ from ..util import parseBoolValue , AMP_SUBSTITUTE , deprecated , HTML_PLACEHOLDER_RE , AtomicString
25
25
from ..treeprocessors import UnescapeTreeprocessor
26
+ from ..serializers import RE_AMP
26
27
import re
27
28
import html
28
29
import unicodedata
30
+ from copy import deepcopy
29
31
import xml .etree .ElementTree as etree
30
32
from typing import TYPE_CHECKING , Any , Iterator , MutableSet
31
33
@@ -63,6 +65,7 @@ def unique(id: str, ids: MutableSet[str]) -> str:
63
65
return id
64
66
65
67
68
+ @deprecated ('Use `render_inner_html` and `striptags` instead.' )
66
69
def get_name (el : etree .Element ) -> str :
67
70
"""Get title name."""
68
71
@@ -75,6 +78,7 @@ def get_name(el: etree.Element) -> str:
75
78
return '' .join (text ).strip ()
76
79
77
80
81
+ @deprecated ('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.' )
78
82
def stashedHTML2text (text : str , md : Markdown , strip_entities : bool = True ) -> str :
79
83
""" Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
80
84
def _html_sub (m : re .Match [str ]) -> str :
@@ -93,11 +97,80 @@ def _html_sub(m: re.Match[str]) -> str:
93
97
94
98
95
99
def unescape(text: str) -> str:
    """ Unescape Markdown backslash escaped text.

    Delegates to a fresh `UnescapeTreeprocessor` instance and returns the
    result of its `unescape` method applied to `text`.
    """
    return UnescapeTreeprocessor().unescape(text)
99
103
100
104
105
def strip_tags(text: str) -> str:
    """ Strip HTML tags and return plain text. Note: HTML entities are unaffected.

    Comments (`<!-- ... -->`) are removed first, then anything between a `<`
    and the next `>`. Finally, every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is dropped.

    An unterminated comment or tag (an opener with no matching closer after
    it) stops the corresponding removal pass.
    """
    # A comment could contain a tag, so strip comments first.
    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
        # Join the two halves directly; nothing may be inserted in place of the comment.
        text = text[:start] + text[end + 3:]

    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
        text = text[:start] + text[end + 1:]

    # Collapse whitespace.
    return ' '.join(text.split())
117
+
118
+
119
def escape_cdata(text: str) -> str:
    """ Escape character data.

    Replaces `&`, `<` and `>` with the entities `&amp;`, `&lt;` and `&gt;`.
    Ampersands are replaced through `RE_AMP` rather than a plain string
    replace, so that one which is already part of an entity is left alone.
    """
    if "&" in text:
        # Only replace & when not part of an entity.
        text = RE_AMP.sub('&amp;', text)
    if "<" in text:
        text = text.replace("<", "&lt;")
    if ">" in text:
        text = text.replace(">", "&gt;")
    return text
129
+
130
+
131
def run_postprocessors(text: str, md: Markdown) -> str:
    """ Run postprocessors from Markdown instance on text.

    Each item of `md.postprocessors` is applied in iteration order, the
    output of one `run` call becoming the input of the next. The final
    result is returned with leading and trailing whitespace stripped.
    """
    for pp in md.postprocessors:
        text = pp.run(text)
    return text.strip()
136
+
137
+
138
def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """ Fully render inner html of an `etree` element as a string.

    The element is serialized with `md.serializer`, backslash escapes are
    resolved, the element's own wrapping tag is cut away, and the remaining
    markup is passed through the postprocessors of `md`.

    Raises `ValueError` (from `str.index` / `str.rindex`) if the serialized
    text contains no `>` or no `<`.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    text = unescape(md.serializer(el))

    # Strip parent tag: keep what lies between the first `>` and the last `<`.
    # NOTE(review): assumes the serialized text starts with the element's
    # opening tag and ends with its closing tag -- confirm against serializer.
    start = text.index('>') + 1
    end = text.rindex('<')
    text = text[start:end].strip()

    return run_postprocessors(text, md)
149
+
150
+
151
def remove_fnrefs(root: etree.Element) -> etree.Element:
    """ Remove footnote references from a copy of the element, if any are present.

    Footnote references look like this: `<sup id="fnref:1">...</sup>`.

    If `root` has no `sup` descendants at all, `root` itself is returned
    untouched. Otherwise a deep copy is made and returned, with every `sup`
    whose `id` starts with `fnref` removed from it; the text that followed a
    removed reference (its `tail`) is kept by attaching it to the preceding
    sibling, or to the parent's `text` when there is no preceding sibling.
    """
    # If there are no `sup` elements, then nothing to do.
    if next(root.iter('sup'), None) is None:
        return root
    root = deepcopy(root)
    # Find parent elements that contain `sup` elements.
    for parent in root.findall('.//sup/..'):
        carry_text = ""
        for child in reversed(parent):  # Reversed for the ability to mutate during iteration.
            # Remove matching footnote references but carry any `tail` text to preceding elements.
            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
                # Prepend: iteration runs right-to-left, so this tail comes first.
                carry_text = (child.tail or "") + carry_text
                parent.remove(child)
            elif carry_text:
                child.tail = (child.tail or "") + carry_text
                carry_text = ""
        if carry_text:
            # No earlier sibling took the carried text; it belongs to the parent.
            parent.text = (parent.text or "") + carry_text
    return root
172
+
173
+
101
174
def nest_toc_tokens (toc_list ):
102
175
"""Given an unsorted list with errors and skips, return a nested one.
103
176
@@ -300,27 +373,30 @@ def run(self, doc: etree.Element) -> None:
300
373
for el in doc .iter ():
301
374
if isinstance (el .tag , str ) and self .header_rgx .match (el .tag ):
302
375
self .set_level (el )
303
- text = get_name (el )
376
+ innerhtml = render_inner_html (remove_fnrefs (el ), self .md )
377
+ name = strip_tags (innerhtml )
304
378
305
379
# Do not override pre-existing ids
306
380
if "id" not in el .attrib :
307
- innertext = unescape (stashedHTML2text (text , self .md ))
308
- el .attrib ["id" ] = unique (self .slugify (innertext , self .sep ), used_ids )
381
+ el .attrib ["id" ] = unique (self .slugify (html .unescape (name ), self .sep ), used_ids )
382
+
383
+ data_toc_label = ''
384
+ if 'data-toc-label' in el .attrib :
385
+ data_toc_label = run_postprocessors (unescape (el .attrib ['data-toc-label' ]), self .md )
386
+ # Overwrite name with sanitized value of `data-toc-label`.
387
+ name = escape_cdata (strip_tags (data_toc_label ))
388
+ # Remove the data-toc-label attribute as it is no longer needed
389
+ del el .attrib ['data-toc-label' ]
309
390
310
391
if int (el .tag [- 1 ]) >= self .toc_top and int (el .tag [- 1 ]) <= self .toc_bottom :
311
392
toc_tokens .append ({
312
393
'level' : int (el .tag [- 1 ]),
313
394
'id' : el .attrib ["id" ],
314
- 'name' : unescape (stashedHTML2text (
315
- code_escape (el .attrib .get ('data-toc-label' , text )),
316
- self .md , strip_entities = False
317
- ))
395
+ 'name' : name ,
396
+ 'html' : innerhtml ,
397
+ 'data-toc-label' : data_toc_label
318
398
})
319
399
320
- # Remove the data-toc-label attribute as it is no longer needed
321
- if 'data-toc-label' in el .attrib :
322
- del el .attrib ['data-toc-label' ]
323
-
324
400
if self .use_anchors :
325
401
self .add_anchor (el , el .attrib ["id" ])
326
402
if self .use_permalinks not in [False , None ]:
0 commit comments