@@ -22,33 +22,114 @@ def __init__(self, version):
22
22
self .storage = get_storage_class (settings .RTD_BUILD_MEDIA_STORAGE )()
23
23
24
24
def _parse_content (self , content ):
25
- """Removes new line characters and posible anchors."""
26
- content = content .replace ('¶' , '' ).strip ()
27
- content = content .split ('\n ' )
25
+ """Removes new line characters and strips all whitespaces."""
26
+ content = content .strip ().split ('\n ' )
28
27
29
28
# Convert all new lines to " "
30
29
content = (text .strip () for text in content )
31
30
content = ' ' .join (text for text in content if text )
32
31
return content
33
32
33
+ def _parse_sections (self , title , body ):
34
+ """
35
+ Parses each section into a structured dict.
36
+
37
+ Sub-sections are nested, so they are children of the outer section,
38
+ and sections with the same level are neighbors.
39
+ We index the content under a section till before the next one.
40
+
41
+ We can have pages that have content before the first title or that don't have a title,
42
+ we index that content first under the title of the original page.
43
+ """
44
+ body = self ._clean_body (body )
45
+
46
+ # Index content for pages that don't start with a title.
47
+ try :
48
+ content = self ._parse_section_content (body .child )
49
+ if content :
50
+ yield {
51
+ 'id' : '' ,
52
+ 'title' : title ,
53
+ 'content' : content ,
54
+ }
55
+ except Exception as e :
56
+ log .info ('Unable to index section: %s' , str (e ))
57
+
58
+ # Index content from h1 to h6 headers.
59
+ for head_level in range (1 , 7 ):
60
+ tags = body .css (f'h{ head_level } ' )
61
+ for tag in tags :
62
+ try :
63
+ title , id = self ._parse_section_title (tag )
64
+ content = self ._parse_section_content (tag .next )
65
+ yield {
66
+ 'id' : id ,
67
+ 'title' : title ,
68
+ 'content' : content ,
69
+ }
70
+ except Exception as e :
71
+ log .info ('Unable to index section: %s' , str (e ))
72
+
73
+ def _clean_body (self , body ):
74
+ """
75
+ Removes nodes with irrelevant content before parsing its sections.
76
+
77
+ .. warning::
78
+
79
+ This will mutate the original `body`.
80
+ """
81
+ # Remove all navigation nodes
82
+ nodes_to_be_removed = body .css ('[role=navigation]' )
83
+ for node in nodes_to_be_removed :
84
+ node .decompose ()
85
+
86
+ return body
87
+
34
88
def _is_section (self , tag ):
35
- """Check if `tag` is a section (linkeable header)."""
89
+ """
90
+ Check if `tag` is a section (linkeable header).
91
+
92
+ The tag is a section if:
93
+
94
+ - It's a ``h`` tag.
95
+ - It's a div with a ``section`` class.
96
+ """
97
+ is_header_tag = re .match (r'h\d$' , tag .tag )
98
+ if is_header_tag :
99
+ return True
100
+
36
101
is_div_section = (
37
102
tag .tag == 'div' and
38
103
'section' in tag .attributes .get ('class' , '' ).split ()
39
104
)
40
- return is_div_section
105
+ if is_div_section :
106
+ return True
107
+
108
+ return False
41
109
42
110
def _parse_section_title (self , tag ):
43
111
"""
44
- Parses a section title tag.
112
+ Parses a section title tag and gets its id.
113
+
114
+ The id (used to link to the section) is tested in the following order:
115
+
116
+ - Get the id from the node itself.
117
+ - Get the id from the parent node.
118
+
119
+ Additionally:
45
120
46
- - Removes the permalink value
121
+ - Removes permalink values
47
122
"""
48
123
nodes_to_be_removed = tag .css ('a.headerlink' )
49
124
for node in nodes_to_be_removed :
50
125
node .decompose ()
51
- return self ._parse_content (tag .text ())
126
+
127
+ section_id = tag .attributes .get ('id' , '' )
128
+ if not section_id :
129
+ parent = tag .parent
130
+ section_id = parent .attributes .get ('id' , '' )
131
+
132
+ return self ._parse_content (tag .text ()), section_id
52
133
53
134
def _parse_section_content (self , tag ):
54
135
"""Gets the content from tag till before a new section."""
@@ -200,11 +281,7 @@ def _process_fjson(self, fjson_path):
200
281
if 'body' in data :
201
282
try :
202
283
body = HTMLParser (data ['body' ])
203
- sections = self ._generate_sections (
204
- page_title = title ,
205
- body = body ,
206
- )
207
- sections = list (sections )
284
+ sections = list (self ._parse_sections (title = title , body = body .body ))
208
285
except Exception as e :
209
286
log .info ('Unable to index sections for: %s' , fjson_path )
210
287
@@ -224,22 +301,16 @@ def _process_fjson(self, fjson_path):
224
301
'domain_data' : domain_data ,
225
302
}
226
303
227
- def _generate_sections (self , page_title , body ):
304
+ def _clean_body (self , body ):
228
305
"""
229
- Generates section dicts for each section for Sphinx.
230
-
231
- In Sphinx sub-sections are nested, so they are children of the outer section,
232
- and sections with the same level are neighbors.
233
- We index the content under a section till before the next one.
306
+ Removes sphinx domain nodes.
234
307
235
- We can have pages that have content before the first title or that don't have a title,
236
- we index that content first under the title of the original page (`page_title`).
237
-
238
- Contents that are likely to be a sphinx domain are deleted,
239
- since we already index those in another step.
308
+ This method is overriden to remove contents that are likely
309
+ to be a sphinx domain (`dl` tags).
310
+ We already index those in another step.
240
311
"""
312
+ body = super ()._clean_body (body )
241
313
242
- # Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
243
314
nodes_to_be_removed = []
244
315
245
316
# remove all <dl> tags which contains <dt> tags having 'id' attribute
@@ -249,36 +320,15 @@ def _generate_sections(self, page_title, body):
249
320
if parent .tag == 'dl' :
250
321
nodes_to_be_removed .append (parent )
251
322
323
+ # TODO: see if we really need to remove these
252
324
# remove `Table of Contents` elements
253
325
nodes_to_be_removed += body .css ('.toctree-wrapper' ) + body .css ('.contents.local.topic' )
254
326
255
327
# removing all nodes in list
256
328
for node in nodes_to_be_removed :
257
329
node .decompose ()
258
330
259
- # Index content for pages that don't start with a title.
260
- content = self ._parse_section_content (body .body .child )
261
- if content :
262
- yield {
263
- 'id' : '' ,
264
- 'title' : page_title ,
265
- 'content' : content ,
266
- }
267
-
268
- # Index content from h1 to h6 headers.
269
- for head_level in range (1 , 7 ):
270
- tags = body .css (f'.section > h{ head_level } ' )
271
- for tag in tags :
272
- title = self ._parse_section_title (tag )
273
-
274
- div = tag .parent
275
- section_id = div .attributes .get ('id' , '' )
276
-
277
- yield {
278
- 'id' : section_id ,
279
- 'title' : title ,
280
- 'content' : self ._parse_section_content (tag .next ),
281
- }
331
+ return body
282
332
283
333
def _generate_domains_data (self , body ):
284
334
"""
0 commit comments