|
14 | 14 | log = structlog.get_logger(__name__)
|
15 | 15 |
|
16 | 16 |
|
class Indexer:

    """
    Base class for operations applied to every file of a build.

    Subclasses implement two hooks that are driven by ``_process_files``:
    a per-file step and a final aggregation step.

    ``sync_id`` distinguishes the files of the current sync from those of
    the previous one.
    """

    def process(self, html_file: HTMLFile, sync_id: int):
        """Apply the operation to a single file. Must be overridden."""
        raise NotImplementedError

    def collect(self, sync_id: int):
        """Aggregate results after all files were processed. Must be overridden."""
        raise NotImplementedError
| 34 | + |
| 35 | + |
class SearchIndexer(Indexer):

    """
    Index HTML files in ElasticSearch.

    The project's search configuration is honored: files matching an
    ignore pattern are skipped, and the ranking patterns are applied
    with last-match-wins precedence.

    If search_index_name is provided, it will be used as the search index name,
    otherwise the default one will be used.
    """

    def __init__(
        self,
        project: Project,
        version: Version,
        search_ranking: dict[str, int],
        search_ignore: list[str],
        search_index_name: str | None = None,
    ):
        self.project = project
        self.version = version
        self.search_ranking = search_ranking
        self.search_ignore = search_ignore
        # Reversed so the last pattern declared takes precedence.
        self._reversed_search_ranking = list(reversed(search_ranking.items()))
        self.search_index_name = search_index_name
        self._html_files_to_index = []

    def process(self, html_file: HTMLFile, sync_id: int):
        """Queue ``html_file`` for indexing, unless it matches an ignore pattern."""
        path = html_file.path
        if any(fnmatch(path, pattern) for pattern in self.search_ignore):
            return

        # First match in the reversed list == last declared pattern wins.
        for pattern, rank in self._reversed_search_ranking:
            if fnmatch(path, pattern):
                html_file.rank = rank
                break

        self._html_files_to_index.append(html_file)

    def collect(self, sync_id: int):
        """Push queued files to ES and drop documents from older syncs."""
        # Index new files in ElasticSearch.
        if self._html_files_to_index:
            index_objects(
                document=PageDocument,
                objects=self._html_files_to_index,
                index_name=self.search_index_name,
                # Pages are indexed in small chunks to avoid a
                # large payload that will probably timeout ES.
                chunk_size=100,
            )

        # Remove old HTMLFiles from ElasticSearch.
        remove_indexed_files(
            project_slug=self.project.slug,
            version_slug=self.version.slug,
            sync_id=sync_id,
            index_name=self.search_index_name,
        )
| 94 | + |
| 95 | + |
class IndexFileIndexer(Indexer):

    """
    Create imported files of interest in the DB.

    Only the top-level 404 file and ``index.html`` files are persisted;
    there is no need to track every file. Proxito queries these records
    instead of checking S3 (slow).
    """

    def __init__(self, project: Project, version: Version):
        self.project = project
        self.version = version
        self._html_files_to_save = []

    def process(self, html_file: HTMLFile, sync_id: int):
        """Keep the file only if it's the top-level 404 or an index file."""
        is_top_level_404 = html_file.path == "404.html"
        is_index = html_file.name == "index.html"
        if is_top_level_404 or is_index:
            self._html_files_to_save.append(html_file)

    def collect(self, sync_id: int):
        """Persist collected files and purge records from the previous sync."""
        if self._html_files_to_save:
            HTMLFile.objects.bulk_create(self._html_files_to_save)

        # Delete imported files from the previous build of the version.
        self.version.imported_files.exclude(build=sync_id).delete()
| 121 | + |
| 122 | + |
def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
    """Build the list of indexers to run over the version's files."""
    indexers = []
    # NOTE: ordering matters — the search indexer must come before the
    # index file indexer. Saving the objects in the DB assigns them an id,
    # and we need that id to still be `None` when indexing the objects in
    # ES, so ES generates a unique id for each document.
    # NOTE: external versions never get a search index.
    if not version.is_external:
        indexers.append(
            SearchIndexer(
                project=version.project,
                version=version,
                search_ranking=search_ranking,
                search_ignore=search_ignore,
                search_index_name=search_index_name,
            )
        )
    indexers.append(
        IndexFileIndexer(
            project=version.project,
            version=version,
        )
    )
    return indexers
| 145 | + |
| 146 | + |
def _process_files(*, version: Version, indexers: list[Indexer]):
    """
    Walk the version's HTML files in storage and run each indexer over them.

    Returns the sync ID used for this pass.
    """
    storage_path = version.project.get_storage_path(
        type_="html",
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    # A sync ID is a number different than the current `build` attribute (pending rename),
    # it's used to differentiate the files from the current sync from the previous one.
    # This is useful to easily delete the previous files from the DB and ES.
    # See https://github.com/readthedocs/readthedocs.org/issues/10734.
    previous_build_id = version.imported_files.values_list("build", flat=True).first()
    sync_id = previous_build_id + 1 if previous_build_id else 1

    log.debug(
        "Using sync ID for search indexing",
        sync_id=sync_id,
    )

    for directory, __, filenames in build_media_storage.walk(storage_path):
        for filename in filenames:
            # We don't care about non-HTML files (for now?).
            if not filename.endswith(".html"):
                continue

            absolute_path = build_media_storage.join(directory, filename)
            # Relative path for storage, similar to os.path.relpath.
            relative_path = absolute_path.removeprefix(storage_path).lstrip("/")

            html_file = HTMLFile(
                project=version.project,
                version=version,
                path=relative_path,
                name=filename,
                # TODO: We are setting the commit field since it's required,
                # but it isn't used, and will be removed in the future
                # together with other fields.
                commit="unknown",
                build=sync_id,
            )
            for indexer in indexers:
                indexer.process(html_file, sync_id)

    for indexer in indexers:
        indexer.collect(sync_id)

    # This signal is used for purging the CDN.
    files_changed.send(
        sender=Project,
        project=version.project,
        version=version,
    )
    return sync_id
| 202 | + |
| 203 | + |
17 | 204 | @app.task(queue="reindex")
|
18 | 205 | def index_build(build_id):
|
19 | 206 | """Create imported files and search index for the build."""
|
@@ -49,13 +236,14 @@ def index_build(build_id):
|
49 | 236 | search_ignore = search_config.get("ignore", [])
|
50 | 237 |
|
51 | 238 | try:
|
52 |
| - _create_imported_files_and_search_index( |
| 239 | + indexers = _get_indexers( |
53 | 240 | version=version,
|
54 | 241 | search_ranking=search_ranking,
|
55 | 242 | search_ignore=search_ignore,
|
56 | 243 | )
|
| 244 | + _process_files(version=version, indexers=indexers) |
57 | 245 | except Exception:
|
58 |
| - log.exception("Failed during creation of new files") |
| 246 | + log.exception("Failed to index build") |
59 | 247 |
|
60 | 248 |
|
61 | 249 | @app.task(queue="reindex")
|
@@ -99,14 +287,15 @@ def reindex_version(version_id, search_index_name=None):
|
99 | 287 | search_ignore = search_config.get("ignore", [])
|
100 | 288 |
|
101 | 289 | try:
|
102 |
| - _create_imported_files_and_search_index( |
| 290 | + indexers = _get_indexers( |
103 | 291 | version=version,
|
104 | 292 | search_ranking=search_ranking,
|
105 | 293 | search_ignore=search_ignore,
|
106 | 294 | search_index_name=search_index_name,
|
107 | 295 | )
|
| 296 | + _process_files(version=version, indexers=indexers) |
108 | 297 | except Exception:
|
109 |
| - log.exception("Failed during creation of new files") |
| 298 | + log.exception("Failed to re-index version") |
110 | 299 |
|
111 | 300 |
|
112 | 301 | @app.task(queue="reindex")
|
@@ -141,129 +330,3 @@ def remove_search_indexes(project_slug, version_slug=None):
|
141 | 330 | project_slug=project_slug,
|
142 | 331 | version_slug=version_slug,
|
143 | 332 | )
|
144 |
| - |
145 |
| - |
146 |
| -def _create_imported_files_and_search_index( |
147 |
| - *, version, search_ranking, search_ignore, search_index_name=None |
148 |
| -): |
149 |
| - """ |
150 |
| - Create imported files and search index for the build of the version. |
151 |
| -
|
152 |
| - If the version is external, we don't create a search index for it, only imported files. |
153 |
| - After the process is completed, we delete the files and search index that |
154 |
| - don't belong to the current build id. |
155 |
| -
|
156 |
| - :param search_index: If provided, it will be used as the search index name, |
157 |
| - otherwise the default one will be used. |
158 |
| - """ |
159 |
| - storage_path = version.project.get_storage_path( |
160 |
| - type_="html", |
161 |
| - version_slug=version.slug, |
162 |
| - include_file=False, |
163 |
| - version_type=version.type, |
164 |
| - ) |
165 |
| - # A sync ID is a number different than the current `build` attribute (pending rename), |
166 |
| - # it's used to differentiate the files from the current sync from the previous one. |
167 |
| - # This is useful to easily delete the previous files from the DB and ES. |
168 |
| - # See https://github.com/readthedocs/readthedocs.org/issues/10734. |
169 |
| - imported_file_build_id = version.imported_files.values_list( |
170 |
| - "build", flat=True |
171 |
| - ).first() |
172 |
| - sync_id = imported_file_build_id + 1 if imported_file_build_id else 1 |
173 |
| - |
174 |
| - log.debug( |
175 |
| - "Using sync ID for search indexing", |
176 |
| - sync_id=sync_id, |
177 |
| - ) |
178 |
| - |
179 |
| - html_files_to_index = [] |
180 |
| - html_files_to_save = [] |
181 |
| - reverse_rankings = list(reversed(search_ranking.items())) |
182 |
| - for root, __, filenames in build_media_storage.walk(storage_path): |
183 |
| - for filename in filenames: |
184 |
| - # We don't care about non-HTML files |
185 |
| - if not filename.endswith(".html"): |
186 |
| - continue |
187 |
| - |
188 |
| - full_path = build_media_storage.join(root, filename) |
189 |
| - |
190 |
| - # Generate a relative path for storage similar to os.path.relpath |
191 |
| - relpath = full_path.replace(storage_path, "", 1).lstrip("/") |
192 |
| - |
193 |
| - skip_search_index = False |
194 |
| - if version.is_external: |
195 |
| - # Never index files from external versions. |
196 |
| - skip_search_index = True |
197 |
| - else: |
198 |
| - for pattern in search_ignore: |
199 |
| - if fnmatch(relpath, pattern): |
200 |
| - skip_search_index = True |
201 |
| - break |
202 |
| - |
203 |
| - page_rank = 0 |
204 |
| - # If the file is ignored, we don't need to check for its ranking. |
205 |
| - if not skip_search_index: |
206 |
| - # Last pattern to match takes precedence |
207 |
| - for pattern, rank in reverse_rankings: |
208 |
| - if fnmatch(relpath, pattern): |
209 |
| - page_rank = rank |
210 |
| - break |
211 |
| - |
212 |
| - html_file = HTMLFile( |
213 |
| - project=version.project, |
214 |
| - version=version, |
215 |
| - path=relpath, |
216 |
| - name=filename, |
217 |
| - rank=page_rank, |
218 |
| - # TODO: We are setting the commit field since it's required, |
219 |
| - # but it isn't used, and will be removed in the future |
220 |
| - # together with other fields. |
221 |
| - commit="unknown", |
222 |
| - build=sync_id, |
223 |
| - ignore=skip_search_index, |
224 |
| - ) |
225 |
| - |
226 |
| - if not skip_search_index: |
227 |
| - html_files_to_index.append(html_file) |
228 |
| - |
229 |
| - # Create the imported file only if it's a top-level 404 file, |
230 |
| - # or if it's an index file. We don't need to keep track of all files. |
231 |
| - tryfiles = ["index.html"] |
232 |
| - if relpath == "404.html" or filename in tryfiles: |
233 |
| - html_files_to_save.append(html_file) |
234 |
| - |
235 |
| - # We first index the files in ES, and then save the objects in the DB. |
236 |
| - # This is because saving the objects in the DB will give them an id, |
237 |
| - # and we neeed this id to be `None` when indexing the objects in ES. |
238 |
| - # ES will generate a unique id for each document. |
239 |
| - if html_files_to_index: |
240 |
| - index_objects( |
241 |
| - document=PageDocument, |
242 |
| - objects=html_files_to_index, |
243 |
| - index_name=search_index_name, |
244 |
| - # Pages are indexed in small chunks to avoid a |
245 |
| - # large payload that will probably timeout ES. |
246 |
| - chunk_size=100, |
247 |
| - ) |
248 |
| - |
249 |
| - # Remove old HTMLFiles from ElasticSearch |
250 |
| - remove_indexed_files( |
251 |
| - project_slug=version.project.slug, |
252 |
| - version_slug=version.slug, |
253 |
| - sync_id=sync_id, |
254 |
| - index_name=search_index_name, |
255 |
| - ) |
256 |
| - |
257 |
| - if html_files_to_save: |
258 |
| - HTMLFile.objects.bulk_create(html_files_to_save) |
259 |
| - |
260 |
| - # Delete imported files from the previous build of the version. |
261 |
| - version.imported_files.exclude(build=sync_id).delete() |
262 |
| - |
263 |
| - # This signal is used for purging the CDN. |
264 |
| - files_changed.send( |
265 |
| - sender=Project, |
266 |
| - project=version.project, |
267 |
| - version=version, |
268 |
| - ) |
269 |
| - return sync_id |
|
0 commit comments