Skip to content

Commit 5080adc

Browse files
authored
Search: more general parser for html (#7204)
* Search: more general parser for html. This is a step further for indexing html files from any static site generator. * Fix tests * Remove all navigation roles
1 parent 808cb13 commit 5080adc

File tree

1 file changed

+98
-48
lines changed

1 file changed

+98
-48
lines changed

readthedocs/search/parsers.py

+98-48
Original file line numberDiff line numberDiff line change
@@ -22,33 +22,114 @@ def __init__(self, version):
2222
self.storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
2323

2424
def _parse_content(self, content):
25-
"""Removes new line characters and posible anchors."""
26-
content = content.replace('¶', '').strip()
27-
content = content.split('\n')
25+
"""Removes new line characters and strips all whitespaces."""
26+
content = content.strip().split('\n')
2827

2928
# Convert all new lines to " "
3029
content = (text.strip() for text in content)
3130
content = ' '.join(text for text in content if text)
3231
return content
3332

33+
def _parse_sections(self, title, body):
34+
"""
35+
Parses each section into a structured dict.
36+
37+
Sub-sections are nested, so they are children of the outer section,
38+
and sections with the same level are neighbors.
39+
We index the content under a section till before the next one.
40+
41+
We can have pages that have content before the first title or that don't have a title,
42+
we index that content first under the title of the original page.
43+
"""
44+
body = self._clean_body(body)
45+
46+
# Index content for pages that don't start with a title.
47+
try:
48+
content = self._parse_section_content(body.child)
49+
if content:
50+
yield {
51+
'id': '',
52+
'title': title,
53+
'content': content,
54+
}
55+
except Exception as e:
56+
log.info('Unable to index section: %s', str(e))
57+
58+
# Index content from h1 to h6 headers.
59+
for head_level in range(1, 7):
60+
tags = body.css(f'h{head_level}')
61+
for tag in tags:
62+
try:
63+
title, id = self._parse_section_title(tag)
64+
content = self._parse_section_content(tag.next)
65+
yield {
66+
'id': id,
67+
'title': title,
68+
'content': content,
69+
}
70+
except Exception as e:
71+
log.info('Unable to index section: %s', str(e))
72+
73+
def _clean_body(self, body):
74+
"""
75+
Removes nodes with irrelevant content before parsing its sections.
76+
77+
.. warning::
78+
79+
This will mutate the original `body`.
80+
"""
81+
# Remove all navigation nodes
82+
nodes_to_be_removed = body.css('[role=navigation]')
83+
for node in nodes_to_be_removed:
84+
node.decompose()
85+
86+
return body
87+
3488
def _is_section(self, tag):
35-
"""Check if `tag` is a section (linkeable header)."""
89+
"""
90+
Check if `tag` is a section (linkable header).
91+
92+
The tag is a section if:
93+
94+
- It's a ``h`` tag.
95+
- It's a div with a ``section`` class.
96+
"""
97+
is_header_tag = re.match(r'h\d$', tag.tag)
98+
if is_header_tag:
99+
return True
100+
36101
is_div_section = (
37102
tag.tag == 'div' and
38103
'section' in tag.attributes.get('class', '').split()
39104
)
40-
return is_div_section
105+
if is_div_section:
106+
return True
107+
108+
return False
41109

42110
def _parse_section_title(self, tag):
43111
"""
44-
Parses a section title tag.
112+
Parses a section title tag and gets its id.
113+
114+
The id (used to link to the section) is tested in the following order:
115+
116+
- Get the id from the node itself.
117+
- Get the id from the parent node.
118+
119+
Additionally:
45120
46-
- Removes the permalink value
121+
- Removes permalink values
47122
"""
48123
nodes_to_be_removed = tag.css('a.headerlink')
49124
for node in nodes_to_be_removed:
50125
node.decompose()
51-
return self._parse_content(tag.text())
126+
127+
section_id = tag.attributes.get('id', '')
128+
if not section_id:
129+
parent = tag.parent
130+
section_id = parent.attributes.get('id', '')
131+
132+
return self._parse_content(tag.text()), section_id
52133

53134
def _parse_section_content(self, tag):
54135
"""Gets the content from tag till before a new section."""
@@ -200,11 +281,7 @@ def _process_fjson(self, fjson_path):
200281
if 'body' in data:
201282
try:
202283
body = HTMLParser(data['body'])
203-
sections = self._generate_sections(
204-
page_title=title,
205-
body=body,
206-
)
207-
sections = list(sections)
284+
sections = list(self._parse_sections(title=title, body=body.body))
208285
except Exception as e:
209286
log.info('Unable to index sections for: %s', fjson_path)
210287

@@ -224,22 +301,16 @@ def _process_fjson(self, fjson_path):
224301
'domain_data': domain_data,
225302
}
226303

227-
def _generate_sections(self, page_title, body):
304+
def _clean_body(self, body):
228305
"""
229-
Generates section dicts for each section for Sphinx.
230-
231-
In Sphinx sub-sections are nested, so they are children of the outer section,
232-
and sections with the same level are neighbors.
233-
We index the content under a section till before the next one.
306+
Removes sphinx domain nodes.
234307
235-
We can have pages that have content before the first title or that don't have a title,
236-
we index that content first under the title of the original page (`page_title`).
237-
238-
Contents that are likely to be a sphinx domain are deleted,
239-
since we already index those in another step.
308+
This method is overridden to remove contents that are likely
309+
to be a sphinx domain (`dl` tags).
310+
We already index those in another step.
240311
"""
312+
body = super()._clean_body(body)
241313

242-
# Removing all <dl> tags to prevent duplicate indexing with Sphinx Domains.
243314
nodes_to_be_removed = []
244315

245316
# remove all <dl> tags which contains <dt> tags having 'id' attribute
@@ -249,36 +320,15 @@ def _generate_sections(self, page_title, body):
249320
if parent.tag == 'dl':
250321
nodes_to_be_removed.append(parent)
251322

323+
# TODO: see if we really need to remove these
252324
# remove `Table of Contents` elements
253325
nodes_to_be_removed += body.css('.toctree-wrapper') + body.css('.contents.local.topic')
254326

255327
# removing all nodes in list
256328
for node in nodes_to_be_removed:
257329
node.decompose()
258330

259-
# Index content for pages that don't start with a title.
260-
content = self._parse_section_content(body.body.child)
261-
if content:
262-
yield {
263-
'id': '',
264-
'title': page_title,
265-
'content': content,
266-
}
267-
268-
# Index content from h1 to h6 headers.
269-
for head_level in range(1, 7):
270-
tags = body.css(f'.section > h{head_level}')
271-
for tag in tags:
272-
title = self._parse_section_title(tag)
273-
274-
div = tag.parent
275-
section_id = div.attributes.get('id', '')
276-
277-
yield {
278-
'id': section_id,
279-
'title': title,
280-
'content': self._parse_section_content(tag.next),
281-
}
331+
return body
282332

283333
def _generate_domains_data(self, body):
284334
"""

0 commit comments

Comments
 (0)