Skip to content

Commit b523a78

Browse files
authored
Search: use multi-fields for Wildcard queries (#7613)
Wildcard queries are slow (on .org it returns 502, on .com since the db is small it works just fine). https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html#query-dsl-allow-expensive-queries These types of queries can be optimized by using the Wildcard field https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type. Currently that field isn't implemented in the ES dependencies, but it is just a subclass of the Keyword field (I'll see if I can send a PR upstream). As we still want to make use of the SimpleQueryString queries, I'm using the multi-fields feature. This change edits the index mapping, so we need to rebuild the index and re-index. To test it locally: - `inv docker.manage 'search_index --rebuild'`. - `inv docker.manage reindex_elasticsearch`. - Enable the `DEFAULT_TO_FUZZY_SEARCH` feature flag on a project. I'm not sure how to write tests for this one, and we can't test if this is really fast in production... But testing it locally works and gives better results for both sections and domains!
1 parent 75955e1 commit b523a78

File tree

4 files changed

+136
-38
lines changed

4 files changed

+136
-38
lines changed

readthedocs/search/documents.py

+70-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from django.conf import settings
44
from django_elasticsearch_dsl import Document, Index, fields
55
from elasticsearch import Elasticsearch
6+
from elasticsearch_dsl.field import Keyword
67

78
from readthedocs.projects.models import HTMLFile, Project
89

@@ -17,6 +18,12 @@
1718
log = logging.getLogger(__name__)
1819

1920

21+
# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
22+
# Field class that combines ``Keyword`` with ``fields.DEDField``;
# the only thing it overrides is the ``name`` attribute.
class WildcardField(Keyword, fields.DEDField):
23+
24+
# NOTE(review): presumably ``name`` is the Elasticsearch mapping type
# emitted for this field -- confirm against elasticsearch_dsl.
name = 'wildcard'
25+
26+
2027
class RTDDocTypeMixin:
2128

2229
def update(self, *args, **kwargs):
@@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
3138
@project_index.document
3239
class ProjectDocument(RTDDocTypeMixin, Document):
3340

41+
"""
42+
Document representation of a Project.
43+
44+
We use multi-fields to be able to perform other kinds of queries over the same field.
45+
``raw`` fields are used for Wildcard queries.
46+
"""
47+
3448
# Metadata
3549
url = fields.TextField(attr='get_absolute_url')
3650
users = fields.NestedField(
@@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
4155
)
4256
language = fields.KeywordField()
4357

58+
name = fields.TextField(
59+
attr='name',
60+
fields={
61+
'raw': WildcardField(),
62+
},
63+
)
64+
slug = fields.TextField(
65+
attr='slug',
66+
fields={
67+
'raw': WildcardField(),
68+
},
69+
)
70+
description = fields.TextField(
71+
attr='description',
72+
fields={
73+
'raw': WildcardField(),
74+
},
75+
)
76+
4477
modified_model_field = 'modified_date'
4578

4679
class Django:
4780
model = Project
48-
fields = ('name', 'slug', 'description')
81+
fields = []
4982
ignore_signals = True
5083

5184

@@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document):
6194
instead of [python.submodule].
6295
See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html # noqa
6396
97+
We use multi-fields to be able to perform other kinds of queries over the same field.
98+
``raw`` fields are used for Wildcard queries.
99+
100+
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html
101+
64102
Some text fields use the ``with_positions_offsets`` term vector,
65103
this is to have faster highlighting on big documents.
66104
See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
@@ -75,13 +113,27 @@ class PageDocument(RTDDocTypeMixin, Document):
75113
rank = fields.IntegerField()
76114

77115
# Searchable content
78-
title = fields.TextField(attr='processed_json.title')
116+
title = fields.TextField(
117+
attr='processed_json.title',
118+
fields={
119+
'raw': WildcardField(),
120+
},
121+
)
79122
sections = fields.NestedField(
80123
attr='processed_json.sections',
81124
properties={
82125
'id': fields.KeywordField(),
83-
'title': fields.TextField(),
84-
'content': fields.TextField(term_vector='with_positions_offsets'),
126+
'title': fields.TextField(
127+
fields={
128+
'raw': WildcardField(),
129+
},
130+
),
131+
'content': fields.TextField(
132+
term_vector='with_positions_offsets',
133+
fields={
134+
'raw': WildcardField(),
135+
},
136+
),
85137
}
86138
)
87139
domains = fields.NestedField(
@@ -93,11 +145,20 @@ class PageDocument(RTDDocTypeMixin, Document):
93145

94146
# For showing in the search result
95147
'type_display': fields.TextField(),
96-
'docstrings': fields.TextField(term_vector='with_positions_offsets'),
97-
98-
# Simple analyzer breaks on `.`,
99-
# otherwise search results are too strict for this use case
100-
'name': fields.TextField(analyzer='simple'),
148+
'docstrings': fields.TextField(
149+
term_vector='with_positions_offsets',
150+
fields={
151+
'raw': WildcardField(),
152+
},
153+
),
154+
'name': fields.TextField(
155+
# Simple analyzer breaks on `.`,
156+
# otherwise search results are too strict for this use case
157+
analyzer='simple',
158+
fields={
159+
'raw': WildcardField(),
160+
},
161+
),
101162
}
102163
)
103164

readthedocs/search/faceted_search.py

+32-16
Original file line numberDiff line numberDiff line change
@@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields):
8787
"""
8888
Get a list of query objects according to the query.
8989
90-
If the query is a *single term* (a single word)
91-
we try to match partial words and substrings
92-
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).
93-
94-
If the query is a phrase or contains the syntax from a simple query string,
95-
we use the SimpleQueryString query.
90+
If the query is a single term we try to match partial words and substrings
91+
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
92+
otherwise we use the SimpleQueryString query.
9693
"""
97-
is_single_term = (
98-
not self.use_advanced_query and
99-
query and len(query.split()) <= 1 and
100-
not self._is_advanced_query(query)
101-
)
10294
get_queries_function = (
10395
self._get_single_term_queries
104-
if is_single_term
96+
if self._is_single_term(query)
10597
else self._get_text_queries
10698
)
10799

@@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields):
150142
The score of "and" should be higher as it satisfies both "or" and "and".
151143
152144
We use the Wildcard query with the query surrounded by ``*`` to match substrings.
145+
We use the raw fields (Wildcard fields) instead of the normal field for performance.
153146
154147
For valid options, see:
155148
@@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields):
164157
)
165158
queries.append(query_string)
166159
for field in fields:
167-
# Remove boosting from the field
168-
field = re.sub(r'\^.*$', '', field)
160+
# Remove boosting from the field,
161+
# and query from the raw field.
162+
field = re.sub(r'\^.*$', '.raw', field)
169163
kwargs = {
170164
field: {'value': f'*{query}*'},
171165
}
@@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
188182
prefix_length=1,
189183
)
190184

185+
def _is_single_term(self, query):
    """
    Check if the query is a single term.

    A query is a single term if it is a single word,
    if it doesn't contain the syntax from a simple query string,
    and if `self.use_advanced_query` is False.

    :param query: the search query string (may be empty or ``None``).
    :returns: always a real ``bool``; an empty/``None`` query gives ``False``
     instead of leaking the falsy query value to the caller.
    """
    # Checks run in the same order as a short-circuiting ``and`` chain,
    # so ``_is_advanced_query`` is only called when everything else passed.
    if self.use_advanced_query or not query:
        return False
    # A whitespace-only query splits into zero words and still counts here.
    if len(query.split()) > 1:
        return False
    return not self._is_advanced_query(query)
199+
191200
def _is_advanced_query(self, query):
192201
"""
193202
Check if the query looks like it is using the syntax from a simple query string.
@@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields):
333342
fields=fields,
334343
)
335344

336-
raw_fields = (
345+
raw_fields = [
337346
# Remove boosting from the field
338347
re.sub(r'\^.*$', '', field)
339348
for field in fields
340-
)
349+
]
350+
351+
# Highlight from the raw fields too, if it is a single term.
352+
if self._is_single_term(query):
353+
raw_fields.extend([
354+
re.sub(r'\^.*$', '.raw', field)
355+
for field in fields
356+
])
341357

342358
highlight = dict(
343359
self._highlight_options,

readthedocs/search/serializers.py

+28-12
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,28 @@
2525
VersionData = namedtuple('VersionData', ['slug', 'docs_url'])
2626

2727

28+
def get_raw_field(obj, field, default=None):
    """
    Get the ``raw`` version of this field or fallback to the original field.

    The ``raw`` version is looked up as an attribute literally named
    ``<field>.raw`` on ``obj`` (e.g. ``sections.title.raw``).

    :param obj: object to read the attributes from.
    :param field: name of the original field, e.g. ``sections.title``.
    :param default: value returned when neither attribute provides a value.
    :returns: the ``raw`` value when it exists and is non-empty,
     otherwise the original field's value, otherwise ``default``.
    """
    # ``default`` must not be used for the first lookup: a truthy default
    # would short-circuit the ``or`` and the original field would never
    # be consulted.
    return (
        getattr(obj, f'{field}.raw', None)
        or getattr(obj, field, default)
    )
34+
35+
2836
class ProjectHighlightSerializer(serializers.Serializer):
2937

30-
name = serializers.ListField(child=serializers.CharField(), default=list)
31-
slug = serializers.ListField(child=serializers.CharField(), default=list)
32-
description = serializers.ListField(child=serializers.CharField(), default=list)
38+
name = serializers.SerializerMethodField()
39+
slug = serializers.SerializerMethodField()
40+
description = serializers.SerializerMethodField()
41+
42+
def get_name(self, obj):
43+
return list(get_raw_field(obj, 'name', []))
44+
45+
def get_slug(self, obj):
46+
return list(get_raw_field(obj, 'slug', []))
47+
48+
def get_description(self, obj):
49+
return list(get_raw_field(obj, 'description', []))
3350

3451

3552
class ProjectSearchSerializer(serializers.Serializer):
@@ -44,7 +61,10 @@ class ProjectSearchSerializer(serializers.Serializer):
4461

4562
class PageHighlightSerializer(serializers.Serializer):
4663

47-
title = serializers.ListField(child=serializers.CharField(), default=list)
64+
title = serializers.SerializerMethodField()
65+
66+
def get_title(self, obj):
67+
return list(get_raw_field(obj, 'title', []))
4868

4969

5070
class PageSearchSerializer(serializers.Serializer):
@@ -166,12 +186,10 @@ class DomainHighlightSerializer(serializers.Serializer):
166186
content = serializers.SerializerMethodField()
167187

168188
def get_name(self, obj):
169-
name = getattr(obj, 'domains.name', [])
170-
return list(name)
189+
return list(get_raw_field(obj, 'domains.name', []))
171190

172191
def get_content(self, obj):
173-
docstring = getattr(obj, 'domains.docstrings', [])
174-
return list(docstring)
192+
return list(get_raw_field(obj, 'domains.docstrings', []))
175193

176194

177195
class DomainSearchSerializer(serializers.Serializer):
@@ -190,12 +208,10 @@ class SectionHighlightSerializer(serializers.Serializer):
190208
content = serializers.SerializerMethodField()
191209

192210
def get_title(self, obj):
193-
title = getattr(obj, 'sections.title', [])
194-
return list(title)
211+
return list(get_raw_field(obj, 'sections.title', []))
195212

196213
def get_content(self, obj):
197-
content = getattr(obj, 'sections.content', [])
198-
return list(content)
214+
return list(get_raw_field(obj, 'sections.content', []))
199215

200216

201217
class SectionSearchSerializer(serializers.Serializer):

readthedocs/search/tests/test_api.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,12 @@ def test_search_single_query(self, api_client):
605605

606606
results = resp.data['results']
607607
assert len(results) > 0
608-
assert 'Index' in results[0]['title']
608+
assert 'Support' in results[0]['title']
609+
# "find" is a closer match than "index", so it is listed first.
610+
highlights = results[0]['blocks'][0]['highlights']
611+
assert '<span>find</span>' in highlights['content'][0]
612+
613+
assert 'Index' in results[1]['title']
609614

610615
# Query with a partial word, but we want to match that
611616
search_params = {

0 commit comments

Comments
 (0)