Search: use multi-fields for Wildcard queries (#7613)

stsewd · web-flow · commit b523a780f5c6 · 2021-06-09T11:49:46.000-05:00
Wildcard queries are slow (on .org it returns 502, on .com since the db is small it works just fine). https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html#query-dsl-allow-expensive-queries These type of queries can be optimized by using the Wildcard field https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type. Currently that field isn't implemented in the ES dependencies, but is just a subclass of the Keyword field (I'll see if I can send a PR upstream). As we still want to make use of the SimpleString queries, I'm using the multi-fields feature. This change is editing the index, so we need to rebuild the index and re-index. To test it locally: - `inv docker.manage 'search_index --rebuild'`. - `inv docker.manage reindex_elasticsearch`. - Enable the `DEFAULT_TO_FUZZY_SEARCH` feature flag on a project. I'm not sure how to write tests for this one, and we can't test if this is really fast in production... But testing it locally works and gives better results for both, sections and domains!
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -3,6 +3,7 @@
 from django.conf import settings
 from django_elasticsearch_dsl import Document, Index, fields
 from elasticsearch import Elasticsearch
+from elasticsearch_dsl.field import Keyword
 
 from readthedocs.projects.models import HTMLFile, Project
 
@@ -17,6 +18,12 @@
 log = logging.getLogger(__name__)
 
 
+# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
+class WildcardField(Keyword, fields.DEDField):
+
+    name = 'wildcard'
+
+
 class RTDDocTypeMixin:
 
     def update(self, *args, **kwargs):
@@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
 @project_index.document
 class ProjectDocument(RTDDocTypeMixin, Document):
 
+    """
+    Document representation of a Project.
+
+    We use multi-fields to be able to perform other kind of queries over the same field.
+    ``raw`` fields are used for Wildcard queries.
+    """
+
     # Metadata
     url = fields.TextField(attr='get_absolute_url')
     users = fields.NestedField(
@@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
     )
     language = fields.KeywordField()
 
+    name = fields.TextField(
+        attr='name',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+    slug = fields.TextField(
+        attr='slug',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+    description = fields.TextField(
+        attr='description',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+
     modified_model_field = 'modified_date'
 
     class Django:
         model = Project
-        fields = ('name', 'slug', 'description')
+        fields = []
         ignore_signals = True
 
 
@@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document):
     instead of [python.submodule].
     See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html  # noqa
 
+    We use multi-fields to be able to perform other kind of queries over the same field.
+    ``raw`` fields are used for Wildcard queries.
+
+    https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html
+
     Some text fields use the ``with_positions_offsets`` term vector,
     this is to have faster highlighting on big documents.
     See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
@@ -75,13 +113,27 @@ class PageDocument(RTDDocTypeMixin, Document):
     rank = fields.IntegerField()
 
     # Searchable content
-    title = fields.TextField(attr='processed_json.title')
+    title = fields.TextField(
+        attr='processed_json.title',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
     sections = fields.NestedField(
         attr='processed_json.sections',
         properties={
             'id': fields.KeywordField(),
-            'title': fields.TextField(),
-            'content': fields.TextField(term_vector='with_positions_offsets'),
+            'title': fields.TextField(
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
+            'content': fields.TextField(
+                term_vector='with_positions_offsets',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
         }
     )
     domains = fields.NestedField(
@@ -93,11 +145,20 @@ class PageDocument(RTDDocTypeMixin, Document):
 
             # For showing in the search result
             'type_display': fields.TextField(),
-            'docstrings': fields.TextField(term_vector='with_positions_offsets'),
-
-            # Simple analyzer breaks on `.`,
-            # otherwise search results are too strict for this use case
-            'name': fields.TextField(analyzer='simple'),
+            'docstrings': fields.TextField(
+                term_vector='with_positions_offsets',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
+            'name': fields.TextField(
+                # Simple analyzer breaks on `.`,
+                # otherwise search results are too strict for this use case
+                analyzer='simple',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
         }
     )
 
diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields):
         """
         Get a list of query objects according to the query.
 
-        If the query is a *single term* (a single word)
-        we try to match partial words and substrings
-        (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).
-
-        If the query is a phrase or contains the syntax from a simple query string,
-        we use the SimpleQueryString query.
+        If the query is a single term we try to match partial words and substrings
+        (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
+        otherwise we use the SimpleQueryString query.
         """
-        is_single_term = (
-            not self.use_advanced_query and
-            query and len(query.split()) <= 1 and
-            not self._is_advanced_query(query)
-        )
         get_queries_function = (
             self._get_single_term_queries
-            if is_single_term
+            if self._is_single_term(query)
             else self._get_text_queries
         )
 
@@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields):
         The score of "and" should be higher as it satisfies both "or" and "and".
 
         We use the Wildcard query with the query surrounded by ``*`` to match substrings.
+        We use the raw fields (Wildcard fields) instead of the normal field for performance.
 
         For valid options, see:
 
@@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields):
             )
             queries.append(query_string)
         for field in fields:
-            # Remove boosting from the field
-            field = re.sub(r'\^.*$', '', field)
+            # Remove boosting from the field,
+            # and query from the raw field.
+            field = re.sub(r'\^.*$', '.raw', field)
             kwargs = {
                 field: {'value': f'*{query}*'},
             }
@@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
             prefix_length=1,
         )
 
+    def _is_single_term(self, query):
+        """
+        Check if the query is a single term.
+
+        A query is a single term if it is a single word,
+        if it doesn't contain the syntax from a simple query string,
+        and if `self.use_advanced_query` is False.
+        """
+        is_single_term = (
+            not self.use_advanced_query and
+            query and len(query.split()) <= 1 and
+            not self._is_advanced_query(query)
+        )
+        return is_single_term
+
     def _is_advanced_query(self, query):
         """
         Check if query looks like to be using the syntax from a simple query string.
@@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields):
             fields=fields,
         )
 
-        raw_fields = (
+        raw_fields = [
             # Remove boosting from the field
             re.sub(r'\^.*$', '', field)
             for field in fields
-        )
+        ]
+
+        # Highlight from the raw fields too, if it is a single term.
+        if self._is_single_term(query):
+            raw_fields.extend([
+                re.sub(r'\^.*$', '.raw', field)
+                for field in fields
+            ])
 
         highlight = dict(
             self._highlight_options,
diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -25,11 +25,28 @@
 VersionData = namedtuple('VersionData', ['slug', 'docs_url'])
 
 
+def get_raw_field(obj, field, default=None):
+    """Get the ``raw`` version of this field or fallback to the original field."""
+    return (
+        getattr(obj, f'{field}.raw', default)
+        or getattr(obj, field, default)
+    )
+
+
 class ProjectHighlightSerializer(serializers.Serializer):
 
-    name = serializers.ListField(child=serializers.CharField(), default=list)
-    slug = serializers.ListField(child=serializers.CharField(), default=list)
-    description = serializers.ListField(child=serializers.CharField(), default=list)
+    name = serializers.SerializerMethodField()
+    slug = serializers.SerializerMethodField()
+    description = serializers.SerializerMethodField()
+
+    def get_name(self, obj):
+        return list(get_raw_field(obj, 'name', []))
+
+    def get_slug(self, obj):
+        return list(get_raw_field(obj, 'slug', []))
+
+    def get_description(self, obj):
+        return list(get_raw_field(obj, 'description', []))
 
 
 class ProjectSearchSerializer(serializers.Serializer):
@@ -44,7 +61,10 @@ class ProjectSearchSerializer(serializers.Serializer):
 
 class PageHighlightSerializer(serializers.Serializer):
 
-    title = serializers.ListField(child=serializers.CharField(), default=list)
+    title = serializers.SerializerMethodField()
+
+    def get_title(self, obj):
+        return list(get_raw_field(obj, 'title', []))
 
 
 class PageSearchSerializer(serializers.Serializer):
@@ -166,12 +186,10 @@ class DomainHighlightSerializer(serializers.Serializer):
     content = serializers.SerializerMethodField()
 
     def get_name(self, obj):
-        name = getattr(obj, 'domains.name', [])
-        return list(name)
+        return list(get_raw_field(obj, 'domains.name', []))
 
     def get_content(self, obj):
-        docstring = getattr(obj, 'domains.docstrings', [])
-        return list(docstring)
+        return list(get_raw_field(obj, 'domains.docstrings', []))
 
 
 class DomainSearchSerializer(serializers.Serializer):
@@ -190,12 +208,10 @@ class SectionHighlightSerializer(serializers.Serializer):
     content = serializers.SerializerMethodField()
 
     def get_title(self, obj):
-        title = getattr(obj, 'sections.title', [])
-        return list(title)
+        return list(get_raw_field(obj, 'sections.title', []))
 
     def get_content(self, obj):
-        content = getattr(obj, 'sections.content', [])
-        return list(content)
+        return list(get_raw_field(obj, 'sections.content', []))
 
 
 class SectionSearchSerializer(serializers.Serializer):
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -605,7 +605,12 @@ def test_search_single_query(self, api_client):
 
         results = resp.data['results']
         assert len(results) > 0
-        assert 'Index' in results[0]['title']
+        assert 'Support' in results[0]['title']
+        # find is more closer than index, so is listed first.
+        highlights = results[0]['blocks'][0]['highlights']
+        assert '<span>find</span>' in highlights['content'][0]
+
+        assert 'Index' in results[1]['title']
 
         # Query with a partial word, but we want to match that
         search_params = {