Skip to content

Search: improve parser #7233

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 17, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 22 additions & 26 deletions readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def _get_main_node(self, html):

- Try a tag with the ``main`` role.
This role is used by several static sites and themes.
- Try the ``main`` tag.
- Try the first ``h1`` node and return its parent
Usually all sections are neighbors,
so they are children of the same parent node.
Expand All @@ -75,6 +76,10 @@ def _get_main_node(self, html):
if main_node:
return main_node

main_node = body.css_first('main')
if main_node:
return main_node

# TODO: this could be done in smarter way,
# checking for common parents between all h nodes.
first_header = body.css_first('h1')
Expand All @@ -84,10 +89,8 @@ def _get_main_node(self, html):
return None

def _parse_content(self, content):
"""Removes new line characters and strips all whitespaces."""
content = content.strip().split('\n')

# Convert all new lines to " "
"""Converts all new line characters and multiple spaces to a single space."""
content = content.strip().split()
content = (text.strip() for text in content)
content = ' '.join(text for text in content if text)
return content
Expand All @@ -106,12 +109,12 @@ def _parse_sections(self, title, body):
body = self._clean_body(body)

# Index content for pages that don't start with a title.
# We check for sections till 2 levels to avoid indexing all the content
# We check for sections up to 3 levels deep to avoid indexing all the content
# in this step.
try:
content, _ = self._parse_section_content(
body.child,
depth=2,
depth=3,
)
if content:
yield {
Expand All @@ -128,7 +131,7 @@ def _parse_sections(self, title, body):
for tag in tags:
try:
title, id = self._parse_section_title(tag)
content, _ = self._parse_section_content(tag.next)
content, _ = self._parse_section_content(tag.next, depth=2)
yield {
'id': id,
'title': title,
Expand All @@ -146,7 +149,11 @@ def _clean_body(self, body):
This will mutate the original `body`.
"""
# Remove all navigation nodes
nodes_to_be_removed = body.css('[role=navigation]')
nodes_to_be_removed = itertools.chain(
body.css('nav'),
body.css('[role=navigation]'),
body.css('[role=search]'),
)
for node in nodes_to_be_removed:
node.decompose()

Expand All @@ -156,23 +163,10 @@ def _is_section(self, tag):
"""
Check if `tag` is a section (linkable header).

The tag is a section if:

- It's a ``h`` tag.
- It's a div with a ``section`` class.
The tag is a section if it's a ``h`` tag.
"""
is_header_tag = re.match(r'h\d$', tag.tag)
if is_header_tag:
return True

is_div_section = (
tag.tag == 'div' and
'section' in tag.attributes.get('class', '').split()
)
if is_div_section:
return True

return False
return is_header_tag

def _parse_section_title(self, tag):
"""
Expand All @@ -187,7 +181,7 @@ def _parse_section_title(self, tag):

- Removes permalink values
"""
nodes_to_be_removed = tag.css('a.headerlink')
nodes_to_be_removed = tag.css('.headerlink')
for node in nodes_to_be_removed:
node.decompose()

Expand Down Expand Up @@ -219,7 +213,9 @@ def _parse_section_content(self, tag, *, depth=0):
if self._is_code_section(next_tag):
content = self._parse_code_section(next_tag)
elif depth <= 0 or not next_tag.child:
content = self._parse_content(next_tag.text())
# Calling .text() with deep=True on a text node returns an empty string.
deep = next_tag.tag != '-text'
content = next_tag.text(deep=deep)
else:
content, section_found = self._parse_section_content(
tag=next_tag.child,
Expand All @@ -230,7 +226,7 @@ def _parse_section_content(self, tag, *, depth=0):
contents.append(content)
next_tag = next_tag.next

return ' '.join(contents), section_found
return self._parse_content(''.join(contents)), section_found

def _is_code_section(self, tag):
"""
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
{
"id": "mkdocs",
"title": "MkDocs",
"content": "Project documentation with\u00a0Markdown."
"content": "Project documentation with Markdown."
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No more weird chars now that we are stripping all white spaces :D

},
{
"id": "overview",
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/search/tests/data/sphinx/out/page.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{
"id": "adding-a-new-scenario-to-the-repository",
"title": "Adding a new scenario to the repository",
"content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
"content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
},
{
"content": "This is a H3 title.",
Expand Down