2
2
3
3
import structlog
4
4
5
- from readthedocs .builds .constants import EXTERNAL
5
+ from readthedocs .builds .constants import BUILD_STATE_FINISHED , EXTERNAL
6
6
from readthedocs .builds .models import Version
7
7
from readthedocs .projects .models import HTMLFile , ImportedFile , Project
8
8
from readthedocs .projects .signals import files_changed
9
- from readthedocs .search .utils import index_new_files , remove_indexed_files
9
+ from readthedocs .search .utils import remove_indexed_files
10
+ from django_elasticsearch_dsl .registries import registry
10
11
from readthedocs .storage import build_media_storage
11
12
from readthedocs .worker import app
12
13
@@ -43,7 +44,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
43
44
_create_imported_files (
44
45
version = version ,
45
46
commit = commit ,
46
- build = build ,
47
+ build_id = build ,
47
48
search_ranking = search_ranking ,
48
49
search_ignore = search_ignore ,
49
50
)
@@ -65,9 +66,6 @@ def _sync_imported_files(version, build):
65
66
"""
66
67
project = version .project
67
68
68
- # Index new HTMLFiles to ElasticSearch
69
- index_new_files (model = HTMLFile , version = version , build = build )
70
-
71
69
# Remove old HTMLFiles from ElasticSearch
72
70
remove_indexed_files (
73
71
model = HTMLFile ,
@@ -95,7 +93,35 @@ def remove_search_indexes(project_slug, version_slug=None):
95
93
)
96
94
97
95
98
- def _create_imported_files (* , version , commit , build , search_ranking , search_ignore ):
96
def reindex_version(version):
    """
    Reindex all files of this version.

    Picks the build of ``version`` that is in the finished state and marked
    successful, taking the first one when ordered by ``date`` descending,
    and re-creates the imported files from it through
    ``_create_imported_files``. Returns without doing anything when no such
    build exists.

    :param version: Version whose files should be reindexed.
    :returns: ``None``.
    """
    latest_successful_build = (
        version.builds.filter(state=BUILD_STATE_FINISHED, success=True)
        .order_by("-date")
        .first()
    )
    # If the version doesn't have a successful
    # build, we don't have files to index.
    if not latest_successful_build:
        return

    # ``_create_imported_files`` calls ``search_ranking.items()``, so the
    # fallback has to be a mapping: an empty list would raise AttributeError
    # for a build without a stored config. ``search_ignore`` is only
    # iterated, so an empty list is fine there.
    search_ranking = {}
    search_ignore = []
    build_config = latest_successful_build.config
    if build_config:
        # NOTE(review): attribute access assumes ``config`` exposes a
        # ``search`` object with ``ranking``/``ignore`` -- confirm it is not
        # a plain dict.
        search_ranking = build_config.search.ranking
        search_ignore = build_config.search.ignore

    _create_imported_files(
        version=version,
        commit=latest_successful_build.commit,
        build_id=latest_successful_build.id,
        search_ranking=search_ranking,
        search_ignore=search_ignore,
    )
122
+
123
+
124
+ def _create_imported_files (* , version , commit , build_id , search_ranking , search_ignore ):
99
125
"""
100
126
Create imported files for version.
101
127
@@ -107,6 +133,9 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
107
133
storage_path = version .project .get_storage_path (
108
134
type_ = 'html' , version_slug = version .slug , include_file = False
109
135
)
136
+ html_files_to_index = []
137
+ html_files_to_save = []
138
+ reverse_rankings = reversed (list (search_ranking .items ()))
110
139
for root , __ , filenames in build_media_storage .walk (storage_path ):
111
140
for filename in filenames :
112
141
# We don't care about non-HTML files
@@ -118,34 +147,60 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
118
147
# Generate a relative path for storage similar to os.path.relpath
119
148
relpath = full_path .replace (storage_path , '' , 1 ).lstrip ('/' )
120
149
121
- page_rank = 0
122
- # Last pattern to match takes precedence
123
- # XXX: see if we can implement another type of precedence,
124
- # like the longest pattern.
125
- reverse_rankings = reversed (list (search_ranking .items ()))
126
- for pattern , rank in reverse_rankings :
127
- if fnmatch (relpath , pattern ):
128
- page_rank = rank
129
- break
130
-
131
150
ignore = False
132
- for pattern in search_ignore :
133
- if fnmatch (relpath , pattern ):
134
- ignore = True
135
- break
151
+ if version .is_external :
152
+ # Never index files from external versions.
153
+ ignore = True
154
+ else :
155
+ for pattern in search_ignore :
156
+ if fnmatch (relpath , pattern ):
157
+ ignore = True
158
+ break
136
159
137
- # Create imported files from new build
138
- HTMLFile .objects .create (
160
+ page_rank = 0
161
+ # If the file is ignored, we don't need to check for its ranking.
162
+ if not ignore :
163
+ # Last pattern to match takes precedence
164
+ # XXX: see if we can implement another type of precedence,
165
+ # like the longest pattern.
166
+ for pattern , rank in reverse_rankings :
167
+ if fnmatch (relpath , pattern ):
168
+ page_rank = rank
169
+ break
170
+
171
+ html_file = HTMLFile (
139
172
project = version .project ,
140
173
version = version ,
141
174
path = relpath ,
142
175
name = filename ,
143
176
rank = page_rank ,
144
177
commit = commit ,
145
- build = build ,
178
+ build = build_id ,
146
179
ignore = ignore ,
147
180
)
148
181
182
+ # Don't index files that are ignored.
183
+ if not ignore :
184
+ html_files_to_index .append (html_file )
185
+
186
+ # Create the imported file only if it's a top-level 404 file,
187
+ # or if it's an index file. We don't need to keep track of all files.
188
+ is_top_level_404_file = filename == "404.html" and root == storage_path
189
+ is_index_file = filename in ["index.html" , "README.html" ]
190
+ if is_top_level_404_file or is_index_file :
191
+ html_files_to_save .append (html_file )
192
+
193
+ # We first index the files in ES, and then save the objects in the DB.
194
+ # This is because saving the objects in the DB will give them an id,
195
+ # and we need this id to be `None` when indexing the objects in ES.
196
+ # ES will generate a unique id for each document.
197
+ if html_files_to_index :
198
+ document = list (registry .get_documents (models = [HTMLFile ]))[0 ]
199
+ document ().update (html_files_to_index )
200
+
201
+ if html_files_to_save :
202
+ HTMLFile .objects .bulk_create (html_files_to_save )
203
+
149
204
# This signal is used for purging the CDN.
150
205
files_changed .send (
151
206
sender = Project ,
0 commit comments