Skip to content

Commit 4638167

Browse files
authored
Search: improve parser (#7233)
* Search: improve parser. This makes the parser more general and matches #7232 (it also includes one bug fix). - Try the `main` tag before trying the first h1. - Always inspect all headers up to 2 levels deep (this removes the need for the Sphinx special case, where the h tag is inside a div). - `_parse_content` now not only removes all newline characters, but also collapses multiple spaces into one. - Remove elements with the search role in addition to the navigation role. - The headerlink class doesn't need to be on an `a` tag. - Fix a bug where calling .text() on a text node returns an empty string. (I was able to catch this one now that we are checking up to 2 levels deep.) * Increase depth. Now that we prioritize the `main` tag as the main node, the main node from the mkdocs material theme is wider. * Strip spaces
1 parent 9dab1d2 commit 4638167

File tree

3 files changed

+24
-28
lines changed

3 files changed

+24
-28
lines changed

readthedocs/search/parsers.py

+22-26
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def _get_main_node(self, html):
6666
6767
- Try with a tag with the ``main`` role.
6868
This role is used by several static sites and themes.
69+
- Try the ``main`` tag.
6970
- Try the first ``h1`` node and return its parent
7071
Usually all sections are neighbors,
7172
so they are children of the same parent node.
@@ -75,6 +76,10 @@ def _get_main_node(self, html):
7576
if main_node:
7677
return main_node
7778

79+
main_node = body.css_first('main')
80+
if main_node:
81+
return main_node
82+
7883
# TODO: this could be done in smarter way,
7984
# checking for common parents between all h nodes.
8085
first_header = body.css_first('h1')
@@ -84,10 +89,8 @@ def _get_main_node(self, html):
8489
return None
8590

8691
def _parse_content(self, content):
87-
"""Removes new line characters and strips all whitespaces."""
88-
content = content.strip().split('\n')
89-
90-
# Convert all new lines to " "
92+
"""Converts all new line characters and multiple spaces to a single space."""
93+
content = content.strip().split()
9194
content = (text.strip() for text in content)
9295
content = ' '.join(text for text in content if text)
9396
return content
@@ -106,12 +109,12 @@ def _parse_sections(self, title, body):
106109
body = self._clean_body(body)
107110

108111
# Index content for pages that don't start with a title.
109-
# We check for sections till 2 levels to avoid indexing all the content
112+
# We check for sections till 3 levels to avoid indexing all the content
110113
# in this step.
111114
try:
112115
content, _ = self._parse_section_content(
113116
body.child,
114-
depth=2,
117+
depth=3,
115118
)
116119
if content:
117120
yield {
@@ -128,7 +131,7 @@ def _parse_sections(self, title, body):
128131
for tag in tags:
129132
try:
130133
title, id = self._parse_section_title(tag)
131-
content, _ = self._parse_section_content(tag.next)
134+
content, _ = self._parse_section_content(tag.next, depth=2)
132135
yield {
133136
'id': id,
134137
'title': title,
@@ -146,7 +149,11 @@ def _clean_body(self, body):
146149
This will mutate the original `body`.
147150
"""
148151
# Remove all navigation nodes
149-
nodes_to_be_removed = body.css('[role=navigation]')
152+
nodes_to_be_removed = itertools.chain(
153+
body.css('nav'),
154+
body.css('[role=navigation]'),
155+
body.css('[role=search]'),
156+
)
150157
for node in nodes_to_be_removed:
151158
node.decompose()
152159

@@ -156,23 +163,10 @@ def _is_section(self, tag):
156163
"""
157164
Check if `tag` is a section (linkeable header).
158165
159-
The tag is a section if:
160-
161-
- It's a ``h`` tag.
162-
- It's a div with a ``section`` class.
166+
The tag is a section if it's a ``h`` tag.
163167
"""
164168
is_header_tag = re.match(r'h\d$', tag.tag)
165-
if is_header_tag:
166-
return True
167-
168-
is_div_section = (
169-
tag.tag == 'div' and
170-
'section' in tag.attributes.get('class', '').split()
171-
)
172-
if is_div_section:
173-
return True
174-
175-
return False
169+
return is_header_tag
176170

177171
def _parse_section_title(self, tag):
178172
"""
@@ -187,7 +181,7 @@ def _parse_section_title(self, tag):
187181
188182
- Removes permalink values
189183
"""
190-
nodes_to_be_removed = tag.css('a.headerlink')
184+
nodes_to_be_removed = tag.css('.headerlink')
191185
for node in nodes_to_be_removed:
192186
node.decompose()
193187

@@ -219,7 +213,9 @@ def _parse_section_content(self, tag, *, depth=0):
219213
if self._is_code_section(next_tag):
220214
content = self._parse_code_section(next_tag)
221215
elif depth <= 0 or not next_tag.child:
222-
content = self._parse_content(next_tag.text())
216+
# Calling .text() with deep `True` over a text node will return empty.
217+
deep = next_tag.tag != '-text'
218+
content = next_tag.text(deep=deep)
223219
else:
224220
content, section_found = self._parse_section_content(
225221
tag=next_tag.child,
@@ -230,7 +226,7 @@ def _parse_section_content(self, tag, *, depth=0):
230226
contents.append(content)
231227
next_tag = next_tag.next
232228

233-
return ' '.join(contents), section_found
229+
return self._parse_content(''.join(contents)), section_found
234230

235231
def _is_code_section(self, tag):
236232
"""

readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
{
77
"id": "mkdocs",
88
"title": "MkDocs",
9-
"content": "Project documentation with\u00a0Markdown."
9+
"content": "Project documentation with Markdown."
1010
},
1111
{
1212
"id": "overview",

readthedocs/search/tests/data/sphinx/out/page.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
{
2626
"id": "adding-a-new-scenario-to-the-repository",
2727
"title": "Adding a new scenario to the repository",
28-
"content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
28+
"content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
2929
},
3030
{
3131
"content": "This is a H3 title.",

0 commit comments

Comments
 (0)