Commit a182899
committed
Build: refactor search indexing process
Currently, we walk the entire project directory to apply two operations: index files in ES, and keep track of index/404 files. These two operations are independent, but in our code they are mixed together in order to avoid walking the project directory twice. I have abstracted the processing of the files with an "Indexer" class, which is responsible for performing an operation on each file and, at the end, collecting the results.
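To illustrate the pattern (this example is not part of the commit): any per-file operation can plug into the single directory walk by subclassing Indexer. The hypothetical WordCountIndexer below assumes it sits in readthedocs/projects/tasks/search.py, where Indexer, HTMLFile, and log are already defined:

    class WordCountIndexer(Indexer):
        """Hypothetical indexer that only counts the HTML files seen during a sync."""

        def __init__(self):
            self.seen = 0

        def process(self, html_file: HTMLFile, sync_id: int):
            # Called once per HTML file while the storage is walked.
            self.seen += 1

        def collect(self, sync_id: int):
            # Called once after the walk, with the same sync_id.
            log.debug("HTML files walked", count=self.seen, sync_id=sync_id)

The build drives every indexer the same way: process() runs for each HTMLFile during the single walk, and collect() runs once at the end (see _process_files below).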
1 parent ed7c06e commit a182899

2 files changed: +196 -132 lines changed

readthedocs/projects/tasks/search.py (+193 -130)

@@ -14,6 +14,193 @@
 log = structlog.get_logger(__name__)
 
 
+class Indexer:
+
+    """
+    Base class for doing operations over each file from a build.
+
+    The process method should be implemented to apply the operation
+    over each file, and the collect method should be implemented
+    to collect the results of the operation after processing all files.
+
+    `sync_id` is used to differentiate the files from the current sync from the previous one.
+    """
+
+    def process(self, html_file: HTMLFile, sync_id: int):
+        raise NotImplementedError
+
+    def collect(self, sync_id: int):
+        raise NotImplementedError
+
+
+class SearchIndexer(Indexer):
+
+    """
+    Index HTML files in ElasticSearch.
+
+    We respect the search ranking and ignore patterns defined in the project's search configuration.
+
+    If search_index_name is provided, it will be used as the search index name,
+    otherwise the default one will be used.
+    """
+
+    def __init__(
+        self,
+        project: Project,
+        version: Version,
+        search_ranking: dict[str, int],
+        search_ignore: list[str],
+        search_index_name: str | None = None,
+    ):
+        self.project = project
+        self.version = version
+        self.search_ranking = search_ranking
+        self.search_ignore = search_ignore
+        self._reversed_search_ranking = list(reversed(search_ranking.items()))
+        self.search_index_name = search_index_name
+        self._html_files_to_index = []
+
+    def process(self, html_file: HTMLFile, sync_id: int):
+        for pattern in self.search_ignore:
+            if fnmatch(html_file.path, pattern):
+                return
+
+        for pattern, rank in self._reversed_search_ranking:
+            if fnmatch(html_file.path, pattern):
+                html_file.rank = rank
+                break
+
+        self._html_files_to_index.append(html_file)
+
+    def collect(self, sync_id: int):
+        # Index new files in ElasticSearch.
+        if self._html_files_to_index:
+            index_objects(
+                document=PageDocument,
+                objects=self._html_files_to_index,
+                index_name=self.search_index_name,
+                # Pages are indexed in small chunks to avoid a
+                # large payload that will probably timeout ES.
+                chunk_size=100,
+            )
+
+        # Remove old HTMLFiles from ElasticSearch.
+        remove_indexed_files(
+            project_slug=self.project.slug,
+            version_slug=self.version.slug,
+            sync_id=sync_id,
+            index_name=self.search_index_name,
+        )
+
+
+class IndexFileIndexer(Indexer):
+
+    """
+    Create imported files of interest in the DB.
+
+    We only save the top-level 404 file and index files,
+    we don't need to keep track of all files.
+    These files are queried by proxito instead of checking S3 (slow).
+    """
+
+    def __init__(self, project: Project, version: Version):
+        self.project = project
+        self.version = version
+        self._html_files_to_save = []
+
+    def process(self, html_file: HTMLFile, sync_id: int):
+        if html_file.path == "404.html" or html_file.name == "index.html":
+            self._html_files_to_save.append(html_file)
+
+    def collect(self, sync_id: int):
+        if self._html_files_to_save:
+            HTMLFile.objects.bulk_create(self._html_files_to_save)
+
+        # Delete imported files from the previous build of the version.
+        self.version.imported_files.exclude(build=sync_id).delete()
+
+
+def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
+    indexers = []
+    # NOTE: The search indexer must be before the index file indexer.
+    # This is because saving the objects in the DB will give them an id,
+    # and we need this id to be `None` when indexing the objects in ES.
+    # ES will generate a unique id for each document.
+    # NOTE: If the version is external, we don't create a search index for it.
+    if not version.is_external:
+        search_indexer = SearchIndexer(
+            project=version.project,
+            version=version,
+            search_ranking=search_ranking,
+            search_ignore=search_ignore,
+            search_index_name=search_index_name,
+        )
+        indexers.append(search_indexer)
+    index_file_indexer = IndexFileIndexer(
+        project=version.project,
+        version=version,
+    )
+    indexers.append(index_file_indexer)
+    return indexers
+
+
+def _process_files(*, version: Version, indexers: list[Indexer]):
+    storage_path = version.project.get_storage_path(
+        type_="html",
+        version_slug=version.slug,
+        include_file=False,
+        version_type=version.type,
+    )
+    # A sync ID is a number different from the current `build` attribute (pending rename);
+    # it's used to differentiate the files from the current sync from the previous one.
+    # This is useful to easily delete the previous files from the DB and ES.
+    # See https://github.com/readthedocs/readthedocs.org/issues/10734.
+    imported_file_build_id = version.imported_files.values_list(
+        "build", flat=True
+    ).first()
+    sync_id = imported_file_build_id + 1 if imported_file_build_id else 1
+
+    log.debug(
+        "Using sync ID for search indexing",
+        sync_id=sync_id,
+    )
+
+    for root, __, filenames in build_media_storage.walk(storage_path):
+        for filename in filenames:
+            # We don't care about non-HTML files (for now?).
+            if not filename.endswith(".html"):
+                continue
+
+            full_path = build_media_storage.join(root, filename)
+            # Generate a relative path for storage similar to os.path.relpath.
+            relpath = full_path.removeprefix(storage_path).lstrip("/")
+
+            html_file = HTMLFile(
+                project=version.project,
+                version=version,
+                path=relpath,
+                name=filename,
+                # TODO: We are setting the commit field since it's required,
+                # but it isn't used, and will be removed in the future
+                # together with other fields.
+                commit="unknown",
+                build=sync_id,
+            )
+            for indexer in indexers:
+                indexer.process(html_file, sync_id)
+
+    for indexer in indexers:
+        indexer.collect(sync_id)
+
+    # This signal is used for purging the CDN.
+    files_changed.send(
+        sender=Project,
+        project=version.project,
+        version=version,
+    )
+    return sync_id
+
+
 @app.task(queue="reindex")
 def index_build(build_id):
     """Create imported files and search index for the build."""
@@ -49,13 +236,14 @@ def index_build(build_id):
     search_ignore = search_config.get("ignore", [])
 
     try:
-        _create_imported_files_and_search_index(
+        indexers = _get_indexers(
             version=version,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
+        _process_files(version=version, indexers=indexers)
     except Exception:
-        log.exception("Failed during creation of new files")
+        log.exception("Failed to index build")
 
 
 @app.task(queue="reindex")
@@ -99,14 +287,15 @@ def reindex_version(version_id, search_index_name=None):
     search_ignore = search_config.get("ignore", [])
 
    try:
-        _create_imported_files_and_search_index(
+        indexers = _get_indexers(
             version=version,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
             search_index_name=search_index_name,
         )
+        _process_files(version=version, indexers=indexers)
     except Exception:
-        log.exception("Failed during creation of new files")
+        log.exception("Failed to re-index version")
 
 
 @app.task(queue="reindex")
@@ -141,129 +330,3 @@ def remove_search_indexes(project_slug, version_slug=None):
         project_slug=project_slug,
         version_slug=version_slug,
     )
-
-
-def _create_imported_files_and_search_index(
-    *, version, search_ranking, search_ignore, search_index_name=None
-):
-    """
-    Create imported files and search index for the build of the version.
-
-    If the version is external, we don't create a search index for it, only imported files.
-    After the process is completed, we delete the files and search index that
-    don't belong to the current build id.
-
-    :param search_index: If provided, it will be used as the search index name,
-        otherwise the default one will be used.
-    """
-    storage_path = version.project.get_storage_path(
-        type_="html",
-        version_slug=version.slug,
-        include_file=False,
-        version_type=version.type,
-    )
-    # A sync ID is a number different than the current `build` attribute (pending rename),
-    # it's used to differentiate the files from the current sync from the previous one.
-    # This is useful to easily delete the previous files from the DB and ES.
-    # See https://github.com/readthedocs/readthedocs.org/issues/10734.
-    imported_file_build_id = version.imported_files.values_list(
-        "build", flat=True
-    ).first()
-    sync_id = imported_file_build_id + 1 if imported_file_build_id else 1
-
-    log.debug(
-        "Using sync ID for search indexing",
-        sync_id=sync_id,
-    )
-
-    html_files_to_index = []
-    html_files_to_save = []
-    reverse_rankings = list(reversed(search_ranking.items()))
-    for root, __, filenames in build_media_storage.walk(storage_path):
-        for filename in filenames:
-            # We don't care about non-HTML files
-            if not filename.endswith(".html"):
-                continue
-
-            full_path = build_media_storage.join(root, filename)
-
-            # Generate a relative path for storage similar to os.path.relpath
-            relpath = full_path.replace(storage_path, "", 1).lstrip("/")
-
-            skip_search_index = False
-            if version.is_external:
-                # Never index files from external versions.
-                skip_search_index = True
-            else:
-                for pattern in search_ignore:
-                    if fnmatch(relpath, pattern):
-                        skip_search_index = True
-                        break
-
-            page_rank = 0
-            # If the file is ignored, we don't need to check for its ranking.
-            if not skip_search_index:
-                # Last pattern to match takes precedence
-                for pattern, rank in reverse_rankings:
-                    if fnmatch(relpath, pattern):
-                        page_rank = rank
-                        break
-
-            html_file = HTMLFile(
-                project=version.project,
-                version=version,
-                path=relpath,
-                name=filename,
-                rank=page_rank,
-                # TODO: We are setting the commit field since it's required,
-                # but it isn't used, and will be removed in the future
-                # together with other fields.
-                commit="unknown",
-                build=sync_id,
-                ignore=skip_search_index,
-            )
-
-            if not skip_search_index:
-                html_files_to_index.append(html_file)
-
-            # Create the imported file only if it's a top-level 404 file,
-            # or if it's an index file. We don't need to keep track of all files.
-            tryfiles = ["index.html"]
-            if relpath == "404.html" or filename in tryfiles:
-                html_files_to_save.append(html_file)
-
-    # We first index the files in ES, and then save the objects in the DB.
-    # This is because saving the objects in the DB will give them an id,
-    # and we neeed this id to be `None` when indexing the objects in ES.
-    # ES will generate a unique id for each document.
-    if html_files_to_index:
-        index_objects(
-            document=PageDocument,
-            objects=html_files_to_index,
-            index_name=search_index_name,
-            # Pages are indexed in small chunks to avoid a
-            # large payload that will probably timeout ES.
-            chunk_size=100,
-        )
-
-    # Remove old HTMLFiles from ElasticSearch
-    remove_indexed_files(
-        project_slug=version.project.slug,
-        version_slug=version.slug,
-        sync_id=sync_id,
-        index_name=search_index_name,
-    )
-
-    if html_files_to_save:
-        HTMLFile.objects.bulk_create(html_files_to_save)
-
-    # Delete imported files from the previous build of the version.
-    version.imported_files.exclude(build=sync_id).delete()
-
-    # This signal is used for purging the CDN.
-    files_changed.send(
-        sender=Project,
-        project=version.project,
-        version=version,
-    )
-    return sync_id

readthedocs/rtd_tests/tests/test_imported_file.py (+3 -2)

@@ -8,7 +8,7 @@
 
 from readthedocs.builds.constants import EXTERNAL
 from readthedocs.projects.models import HTMLFile, ImportedFile, Project
-from readthedocs.projects.tasks.search import _create_imported_files_and_search_index
+from readthedocs.projects.tasks.search import _get_indexers, _process_files
 from readthedocs.search.documents import PageDocument
 
 base_dir = os.path.dirname(os.path.dirname(__file__))
@@ -46,11 +46,12 @@ def _manage_imported_files(self, version, search_ranking=None, search_ignore=Non
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
         search_ignore = search_ignore or []
-        return _create_imported_files_and_search_index(
+        indexers = _get_indexers(
             version=version,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
+        return _process_files(version=version, indexers=indexers)
 
     def _copy_storage_dir(self):
         """Copy the test directory (rtd_tests/files) to storage"""
