Skip to content

Commit ec9022c

Browse files
authored
Search: recursively parse sections (#7207)
If we have a structure like - parent - content - content - h1 - content - content - h2 - content and we start indexing from `parent`, we will index all children in the first step and then index each header again later; that is, the content gets duplicated. This is solved by checking for sections up to 1 level deep. In this example, the first parsing pass will stop when it finds the first h1, so no content is duplicated. Later passes will index the following nodes as usual.
1 parent 5080adc commit ec9022c

File tree

1 file changed

+31
-7
lines changed

1 file changed

+31
-7
lines changed

readthedocs/search/parsers.py

+31-7
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,13 @@ def _parse_sections(self, title, body):
4444
body = self._clean_body(body)
4545

4646
# Index content for pages that don't start with a title.
47+
# We check for sections up to 2 levels deep to avoid indexing all the content
48+
# in this step.
4749
try:
48-
content = self._parse_section_content(body.child)
50+
content, _ = self._parse_section_content(
51+
body.child,
52+
depth=2,
53+
)
4954
if content:
5055
yield {
5156
'id': '',
@@ -61,7 +66,7 @@ def _parse_sections(self, title, body):
6166
for tag in tags:
6267
try:
6368
title, id = self._parse_section_title(tag)
64-
content = self._parse_section_content(tag.next)
69+
content, _ = self._parse_section_content(tag.next)
6570
yield {
6671
'id': id,
6772
'title': title,
@@ -131,20 +136,39 @@ def _parse_section_title(self, tag):
131136

132137
return self._parse_content(tag.text()), section_id
133138

134-
def _parse_section_content(self, tag):
135-
"""Gets the content from tag till before a new section."""
139+
def _parse_section_content(self, tag, *, depth=0):
140+
"""
141+
Gets the content starting at tag up to, but not including, the next section.
142+
143+
If depth > 0, recursively check for sections in all of the tag's children.
144+
145+
Returns a tuple with: the parsed content,
146+
and a boolean indicating if a section was found.
147+
"""
136148
contents = []
149+
section_found = False
150+
137151
next_tag = tag
138-
while next_tag and not self._is_section(next_tag):
152+
while next_tag:
153+
if section_found or self._is_section(next_tag):
154+
section_found = True
155+
break
156+
139157
if self._is_code_section(next_tag):
140158
content = self._parse_code_section(next_tag)
141-
else:
159+
elif depth <= 0 or not next_tag.child:
142160
content = self._parse_content(next_tag.text())
161+
else:
162+
content, section_found = self._parse_section_content(
163+
tag=next_tag.child,
164+
depth=depth - 1
165+
)
143166

144167
if content:
145168
contents.append(content)
146169
next_tag = next_tag.next
147-
return ' '.join(contents)
170+
171+
return ' '.join(contents), section_found
148172

149173
def _is_code_section(self, tag):
150174
"""

0 commit comments

Comments
 (0)