readthedocs · stsewd · Jun 9, 2021 · Oct 19, 2020 · Oct 28, 2020 · Dec 10, 2020
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -3,6 +3,7 @@
 from django.conf import settings
 from django_elasticsearch_dsl import Document, Index, fields
 from elasticsearch import Elasticsearch
+from elasticsearch_dsl.field import Keyword
 
 from readthedocs.projects.models import HTMLFile, Project
 
@@ -17,6 +18,12 @@
 log = logging.getLogger(__name__)
 
 
+# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
+class WildcardField(Keyword, fields.DEDField):
+
+    name = 'wildcard'
+
+
 class RTDDocTypeMixin:
 
     def update(self, *args, **kwargs):
@@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
 @project_index.document
 class ProjectDocument(RTDDocTypeMixin, Document):
 
+    """
+    Document representation of a Project.
+
+    We use multi-fields to be able to perform other kinds of queries over the same field.
+    ``raw`` fields are used for Wildcard queries.
+    """
+
     # Metadata
     url = fields.TextField(attr='get_absolute_url')
     users = fields.NestedField(
@@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
     )
     language = fields.KeywordField()
 
+    name = fields.TextField(
+        attr='name',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+    slug = fields.TextField(
+        attr='slug',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+    description = fields.TextField(
+        attr='description',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
+
     modified_model_field = 'modified_date'
 
     class Django:
         model = Project
-        fields = ('name', 'slug', 'description')
+        fields = []
         ignore_signals = True
 
 
@@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document):
     instead of [python.submodule].
     See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html  # noqa
 
+    We use multi-fields to be able to perform other kinds of queries over the same field.
+    ``raw`` fields are used for Wildcard queries.
+
+    https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html
+
     Some text fields use the ``with_positions_offsets`` term vector,
     this is to have faster highlighting on big documents.
     See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
@@ -74,13 +112,27 @@ class PageDocument(RTDDocTypeMixin, Document):
     rank = fields.IntegerField()
 
     # Searchable content
-    title = fields.TextField(attr='processed_json.title')
+    title = fields.TextField(
+        attr='processed_json.title',
+        fields={
+            'raw': WildcardField(),
+        },
+    )
     sections = fields.NestedField(
         attr='processed_json.sections',
         properties={
             'id': fields.KeywordField(),
-            'title': fields.TextField(),
-            'content': fields.TextField(term_vector='with_positions_offsets'),
+            'title': fields.TextField(
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
+            'content': fields.TextField(
+                term_vector='with_positions_offsets',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
         }
     )
     domains = fields.NestedField(
@@ -92,11 +144,20 @@ class PageDocument(RTDDocTypeMixin, Document):
 
             # For showing in the search result
             'type_display': fields.TextField(),
-            'docstrings': fields.TextField(term_vector='with_positions_offsets'),
-
-            # Simple analyzer breaks on `.`,
-            # otherwise search results are too strict for this use case
-            'name': fields.TextField(analyzer='simple'),
+            'docstrings': fields.TextField(
+                term_vector='with_positions_offsets',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
+            'name': fields.TextField(
+                # Simple analyzer breaks on `.`,
+                # otherwise search results are too strict for this use case
+                analyzer='simple',
+                fields={
+                    'raw': WildcardField(),
+                },
+            ),
         }
     )
 

diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields):
         """
         Get a list of query objects according to the query.
 
-        If the query is a *single term* (a single word)
-        we try to match partial words and substrings
-        (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).
-
-        If the query is a phrase or contains the syntax from a simple query string,
-        we use the SimpleQueryString query.
+        If the query is a single term we try to match partial words and substrings
+        (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
+        otherwise we use the SimpleQueryString query.
         """
-        is_single_term = (
-            not self.use_advanced_query and
-            query and len(query.split()) <= 1 and
-            not self._is_advanced_query(query)
-        )
         get_queries_function = (
             self._get_single_term_queries
-            if is_single_term
+            if self._is_single_term(query)
             else self._get_text_queries
         )
 
@@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields):
         The score of "and" should be higher as it satisfies both "or" and "and".
 
         We use the Wildcard query with the query surrounded by ``*`` to match substrings.
+        We use the raw fields (Wildcard fields) instead of the normal field for performance.
 
         For valid options, see:
 
@@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields):
             )
             queries.append(query_string)
         for field in fields:
-            # Remove boosting from the field
-            field = re.sub(r'\^.*$', '', field)
+            # Remove boosting from the field,
+            # and query from the raw field.
+            field = re.sub(r'\^.*$', '.raw', field)
             kwargs = {
                 field: {'value': f'*{query}*'},
             }
@@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
             prefix_length=1,
         )
 
+    def _is_single_term(self, query):
+        """
+        Check if the query is a single term.
+
+        A query is a single term if it is a single word,
+        if it doesn't contain the syntax from a simple query string,
+        and if `self.use_advanced_query` is False.
+        """
+        is_single_term = (
+            not self.use_advanced_query and
+            query and len(query.split()) <= 1 and
+            not self._is_advanced_query(query)
+        )
+        return is_single_term
+
     def _is_advanced_query(self, query):
         """
         Check if query looks like to be using the syntax from a simple query string.
@@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields):
             fields=fields,
         )
 
-        raw_fields = (
+        raw_fields = [
             # Remove boosting from the field
             re.sub(r'\^.*$', '', field)
             for field in fields
-        )
+        ]
+
+        # Highlight from the raw fields too, if it is a single term.
+        if self._is_single_term(query):
+            raw_fields.extend([
+                re.sub(r'\^.*$', '.raw', field)
+                for field in fields
+            ])
 
         highlight = dict(
             self._highlight_options,

diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -24,11 +24,28 @@
 VersionData = namedtuple('VersionData', ['slug', 'docs_url', 'doctype'])
 
 
+def get_raw_field(obj, field, default=None):
+    """Get the ``raw`` version of this field or fall back to the original field."""
+    return (
+        getattr(obj, f'{field}.raw', default)
+        or getattr(obj, field, default)
+    )
+
+
 class ProjectHighlightSerializer(serializers.Serializer):
 
-    name = serializers.ListField(child=serializers.CharField(), default=list)
-    slug = serializers.ListField(child=serializers.CharField(), default=list)
-    description = serializers.ListField(child=serializers.CharField(), default=list)
+    name = serializers.SerializerMethodField()
+    slug = serializers.SerializerMethodField()
+    description = serializers.SerializerMethodField()
+
+    def get_name(self, obj):
+        return list(get_raw_field(obj, 'name', []))
+
+    def get_slug(self, obj):
+        return list(get_raw_field(obj, 'slug', []))
+
+    def get_description(self, obj):
+        return list(get_raw_field(obj, 'description', []))
 
 
 class ProjectSearchSerializer(serializers.Serializer):
@@ -43,7 +60,10 @@ class ProjectSearchSerializer(serializers.Serializer):
 
 class PageHighlightSerializer(serializers.Serializer):
 
-    title = serializers.ListField(child=serializers.CharField(), default=list)
+    title = serializers.SerializerMethodField()
+
+    def get_title(self, obj):
+        return list(get_raw_field(obj, 'title', []))
 
 
 class PageSearchSerializer(serializers.Serializer):
@@ -151,12 +171,10 @@ class DomainHighlightSerializer(serializers.Serializer):
     content = serializers.SerializerMethodField()
 
     def get_name(self, obj):
-        name = getattr(obj, 'domains.name', [])
-        return list(name)
+        return list(get_raw_field(obj, 'domains.name', []))
 
     def get_content(self, obj):
-        docstring = getattr(obj, 'domains.docstrings', [])
-        return list(docstring)
+        return list(get_raw_field(obj, 'domains.docstrings', []))
 
 
 class DomainSearchSerializer(serializers.Serializer):
@@ -175,12 +193,10 @@ class SectionHighlightSerializer(serializers.Serializer):
     content = serializers.SerializerMethodField()
 
     def get_title(self, obj):
-        title = getattr(obj, 'sections.title', [])
-        return list(title)
+        return list(get_raw_field(obj, 'sections.title', []))
 
     def get_content(self, obj):
-        content = getattr(obj, 'sections.content', [])
-        return list(content)
+        return list(get_raw_field(obj, 'sections.content', []))
 
 
 class SectionSearchSerializer(serializers.Serializer):

diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -577,7 +577,12 @@ def test_search_single_query(self, api_client):
 
         results = resp.data['results']
         assert len(results) > 0
-        assert 'Index' in results[0]['title']
+        assert 'Support' in results[0]['title']
+        # find is closer than index, so it is listed first.
+        highlights = results[0]['blocks'][0]['highlights']
+        assert '<span>find</span>' in highlights['content'][0]
+
+        assert 'Index' in results[1]['title']
 
         # Query with a partial word, but we want to match that
         search_params = {