Skip to content

Commit 224c894

Browse files
committed
Search: use multi-fields for Wildcard queries
Wildcard queries are slow (on .org they return a 502; on .com they work just fine because the db is small). https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html#query-dsl-allow-expensive-queries These types of queries can be optimized by using the Wildcard field https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type. Currently that field isn't implemented in the ES dependencies, but it is just a subclass of the Keyword field (I'll see if I can send a PR upstream). As we still want to make use of the SimpleQueryString queries, I'm using the multi-fields feature. This change modifies the index mapping, so we need to rebuild the index and re-index. To test it locally: - `inv docker.manage 'search_index --rebuild'`. - `inv docker.manage reindex_elasticsearch`. - Enable the `DEFAULT_TO_FUZZY_SEARCH` feature flag on a project. I'm not sure how to write tests for this one, and we can't test whether this is really fast in production... But testing it locally works and gives better results for both sections and domains!
1 parent f732a82 commit 224c894

File tree

3 files changed

+121
-36
lines changed

3 files changed

+121
-36
lines changed

readthedocs/search/documents.py

+67-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from django.conf import settings
44
from django_elasticsearch_dsl import Document, Index, fields
55
from elasticsearch import Elasticsearch
6+
from elasticsearch_dsl.field import Keyword
67

78
from readthedocs.projects.models import HTMLFile, Project
89

@@ -17,6 +18,12 @@
1718
log = logging.getLogger(__name__)
1819

1920

21+
# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
22+
class WildcardField(Keyword, fields.DEDField):
23+
24+
name = 'wildcard'
25+
26+
2027
class RTDDocTypeMixin:
2128

2229
def update(self, *args, **kwargs):
@@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
3138
@project_index.document
3239
class ProjectDocument(RTDDocTypeMixin, Document):
3340

41+
"""
42+
Document representation of a Project.
43+
44+
We use multi-fields to be able to perform other kinds of queries over the same field.
45+
``raw`` fields are used for Wildcard queries.
46+
"""
47+
3448
# Metadata
3549
url = fields.TextField(attr='get_absolute_url')
3650
users = fields.NestedField(
@@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
4155
)
4256
language = fields.KeywordField()
4357

58+
name = fields.TextField(
59+
attr='name',
60+
fields={
61+
'raw': WildcardField(),
62+
},
63+
)
64+
slug = fields.TextField(
65+
attr='slug',
66+
fields={
67+
'raw': WildcardField(),
68+
},
69+
)
70+
description = fields.TextField(
71+
attr='description',
72+
fields={
73+
'raw': WildcardField(),
74+
},
75+
)
76+
4477
modified_model_field = 'modified_date'
4578

4679
class Django:
4780
model = Project
48-
fields = ('name', 'slug', 'description')
81+
fields = []
4982
ignore_signals = True
5083

5184

@@ -60,6 +93,9 @@ class PageDocument(RTDDocTypeMixin, Document):
6093
so a text like ``python.submodule`` will be broken like [python, submodule]
6194
instead of [python.submodule].
6295
96+
We use multi-fields to be able to perform other kinds of queries over the same field.
97+
``raw`` fields are used for Wildcard queries.
98+
6399
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html
64100
"""
65101

@@ -71,13 +107,26 @@ class PageDocument(RTDDocTypeMixin, Document):
71107
rank = fields.IntegerField()
72108

73109
# Searchable content
74-
title = fields.TextField(attr='processed_json.title')
110+
title = fields.TextField(
111+
attr='processed_json.title',
112+
fields={
113+
'raw': WildcardField(),
114+
},
115+
)
75116
sections = fields.NestedField(
76117
attr='processed_json.sections',
77118
properties={
78119
'id': fields.KeywordField(),
79-
'title': fields.TextField(),
80-
'content': fields.TextField(),
120+
'title': fields.TextField(
121+
fields={
122+
'raw': WildcardField(),
123+
},
124+
),
125+
'content': fields.TextField(
126+
fields={
127+
'raw': WildcardField(),
128+
},
129+
),
81130
}
82131
)
83132
domains = fields.NestedField(
@@ -89,11 +138,20 @@ class PageDocument(RTDDocTypeMixin, Document):
89138

90139
# For showing in the search result
91140
'type_display': fields.TextField(),
92-
'docstrings': fields.TextField(),
93-
94-
# Simple analyzer breaks on `.`,
95-
# otherwise search results are too strict for this use case
96-
'name': fields.TextField(analyzer='simple'),
141+
'docstrings': fields.TextField(
142+
fields={
143+
'raw': WildcardField(),
144+
},
145+
),
146+
147+
'name': fields.TextField(
148+
# Simple analyzer breaks on `.`,
149+
# otherwise search results are too strict for this use case
150+
analyzer='simple',
151+
fields={
152+
'raw': WildcardField(),
153+
},
154+
),
97155
}
98156
)
99157

readthedocs/search/faceted_search.py

+26-15
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,13 @@ def _get_queries(self, *, query, fields):
7171
"""
7272
Get a list of query objects according to the query.
7373
74-
If the query is a *single term* (a single word)
75-
we try to match partial words and substrings
76-
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).
77-
78-
If the query is a phrase or contains the syntax from a simple query string,
79-
we use the SimpleQueryString query.
74+
If the query is a single term we try to match partial words and substrings
75+
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
76+
otherwise we use the SimpleQueryString query.
8077
"""
81-
is_single_term = (
82-
not self.use_advanced_query and
83-
query and len(query.split()) <= 1 and
84-
not self._is_advanced_query(query)
85-
)
8678
get_queries_function = (
8779
self._get_single_term_queries
88-
if is_single_term
80+
if self._is_single_term(query)
8981
else self._get_text_queries
9082
)
9183

@@ -134,6 +126,7 @@ def _get_single_term_queries(self, query, fields):
134126
The score of "and" should be higher as it satisfies both "or" and "and".
135127
136128
We use the Wildcard query with the query surrounded by ``*`` to match substrings.
129+
We use the raw fields (Wildcard fields) instead of the normal field for performance.
137130
138131
For valid options, see:
139132
@@ -148,8 +141,9 @@ def _get_single_term_queries(self, query, fields):
148141
)
149142
queries.append(query_string)
150143
for field in fields:
151-
# Remove boosting from the field
152-
field = re.sub(r'\^.*$', '', field)
144+
# Remove boosting from the field,
145+
# and query from the raw field.
146+
field = re.sub(r'\^.*$', '.raw', field)
153147
kwargs = {
154148
field: {'value': f'*{query}*'},
155149
}
@@ -172,6 +166,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
172166
prefix_length=1,
173167
)
174168

169+
def _is_single_term(self, query):
170+
"""
171+
Check if the query is a single term.
172+
173+
A query is a single term if it is a single word,
174+
if it doesn't contain the syntax from a simple query string,
175+
and if `self.use_advanced_query` is False.
176+
"""
177+
is_single_term = (
178+
not self.use_advanced_query and
179+
query and len(query.split()) <= 1 and
180+
not self._is_advanced_query(query)
181+
)
182+
return is_single_term
183+
175184
def _is_advanced_query(self, query):
176185
"""
177186
Check if the query looks like it is using the syntax from a simple query string.
@@ -293,9 +302,11 @@ def _get_nested_query(self, *, query, path, fields):
293302
fields=fields,
294303
)
295304

305+
# Highlight from the raw fields if it is a single term.
306+
replacement = '.raw' if self._is_single_term(query) else ''
296307
raw_fields = (
297308
# Remove boosting from the field
298-
re.sub(r'\^.*$', '', field)
309+
re.sub(r'\^.*$', replacement, field)
299310
for field in fields
300311
)
301312

readthedocs/search/serializers.py

+28-12
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,28 @@
2424
ProjectData = namedtuple('ProjectData', ['docs_url', 'version_doctype'])
2525

2626

27+
def get_raw_field(obj, field, default=None):
28+
"""Get the ``raw`` version of this field or fallback to the original field."""
29+
return (
30+
getattr(obj, f'{field}.raw', default)
31+
or getattr(obj, field, default)
32+
)
33+
34+
2735
class ProjectHighlightSerializer(serializers.Serializer):
2836

29-
name = serializers.ListField(child=serializers.CharField(), default=list)
30-
slug = serializers.ListField(child=serializers.CharField(), default=list)
31-
description = serializers.ListField(child=serializers.CharField(), default=list)
37+
name = serializers.SerializerMethodField()
38+
slug = serializers.SerializerMethodField()
39+
description = serializers.SerializerMethodField()
40+
41+
def get_name(self, obj):
42+
return list(get_raw_field(obj, 'name', []))
43+
44+
def get_slug(self, obj):
45+
return list(get_raw_field(obj, 'slug', []))
46+
47+
def get_description(self, obj):
48+
return list(get_raw_field(obj, 'description', []))
3249

3350

3451
class ProjectSearchSerializer(serializers.Serializer):
@@ -42,7 +59,10 @@ class ProjectSearchSerializer(serializers.Serializer):
4259

4360
class PageHighlightSerializer(serializers.Serializer):
4461

45-
title = serializers.ListField(child=serializers.CharField(), default=list)
62+
title = serializers.SerializerMethodField()
63+
64+
def get_title(self, obj):
65+
return list(get_raw_field(obj, 'title', []))
4666

4767

4868
class PageSearchSerializer(serializers.Serializer):
@@ -146,12 +166,10 @@ class DomainHighlightSerializer(serializers.Serializer):
146166
content = serializers.SerializerMethodField()
147167

148168
def get_name(self, obj):
149-
name = getattr(obj, 'domains.name', [])
150-
return list(name)
169+
return list(get_raw_field(obj, 'domains.name', []))
151170

152171
def get_content(self, obj):
153-
docstring = getattr(obj, 'domains.docstrings', [])
154-
return list(docstring)
172+
return list(get_raw_field(obj, 'domains.docstrings', []))
155173

156174

157175
class DomainSearchSerializer(serializers.Serializer):
@@ -170,12 +188,10 @@ class SectionHighlightSerializer(serializers.Serializer):
170188
content = serializers.SerializerMethodField()
171189

172190
def get_title(self, obj):
173-
title = getattr(obj, 'sections.title', [])
174-
return list(title)
191+
return list(get_raw_field(obj, 'sections.title', []))
175192

176193
def get_content(self, obj):
177-
content = getattr(obj, 'sections.content', [])
178-
return list(content)
194+
return list(get_raw_field(obj, 'sections.content', []))
179195

180196

181197
class SectionSearchSerializer(serializers.Serializer):

0 commit comments

Comments
 (0)