Skip to content

Search: recursively parse sections #7207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 23, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 31 additions & 7 deletions readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,13 @@ def _parse_sections(self, title, body):
body = self._clean_body(body)

# Index content for pages that don't start with a title.
# We only look for sections up to 2 levels deep to avoid indexing all the
# content in this step.
try:
content = self._parse_section_content(body.child)
content, _ = self._parse_section_content(
body.child,
depth=2,
)
if content:
yield {
'id': '',
Expand All @@ -61,7 +66,7 @@ def _parse_sections(self, title, body):
for tag in tags:
try:
title, id = self._parse_section_title(tag)
content = self._parse_section_content(tag.next)
content, _ = self._parse_section_content(tag.next)
yield {
'id': id,
'title': title,
Expand Down Expand Up @@ -131,20 +136,39 @@ def _parse_section_title(self, tag):

return self._parse_content(tag.text()), section_id

def _parse_section_content(self, tag):
"""Gets the content from tag till before a new section."""
def _parse_section_content(self, tag, *, depth=0):
"""
Gets the content from tag till before a new section.

If depth > 0, recursively check for sections in all of the tag's children.

Returns a tuple with: the parsed content,
and a boolean indicating if a section was found.
"""
contents = []
section_found = False

next_tag = tag
while next_tag and not self._is_section(next_tag):
while next_tag:
if section_found or self._is_section(next_tag):
section_found = True
break

if self._is_code_section(next_tag):
content = self._parse_code_section(next_tag)
else:
elif depth <= 0 or not next_tag.child:
content = self._parse_content(next_tag.text())
else:
content, section_found = self._parse_section_content(
tag=next_tag.child,
depth=depth - 1
)

if content:
contents.append(content)
next_tag = next_tag.next
return ' '.join(contents)

return ' '.join(contents), section_found

def _is_code_section(self, tag):
"""
Expand Down