Skip to content

Search: use multi-fields for Wildcard queries #7613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 9, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 70 additions & 9 deletions readthedocs/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from django.conf import settings
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch import Elasticsearch
from elasticsearch_dsl.field import Keyword

from readthedocs.projects.models import HTMLFile, Project

Expand All @@ -17,6 +18,12 @@
log = logging.getLogger(__name__)


# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
class WildcardField(Keyword, fields.DEDField):

name = 'wildcard'


class RTDDocTypeMixin:

def update(self, *args, **kwargs):
Expand All @@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
@project_index.document
class ProjectDocument(RTDDocTypeMixin, Document):

"""
Document representation of a Project.

We use multi-fields to be able to perform other kind of queries over the same field.
``raw`` fields are used for Wildcard queries.
"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯


# Metadata
url = fields.TextField(attr='get_absolute_url')
users = fields.NestedField(
Expand All @@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
)
language = fields.KeywordField()

name = fields.TextField(
attr='name',
fields={
'raw': WildcardField(),
},
)
slug = fields.TextField(
attr='slug',
fields={
'raw': WildcardField(),
},
)
description = fields.TextField(
attr='description',
fields={
'raw': WildcardField(),
},
)

modified_model_field = 'modified_date'

class Django:
model = Project
fields = ('name', 'slug', 'description')
fields = []
ignore_signals = True


Expand All @@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document):
instead of [python.submodule].
See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html # noqa

We use multi-fields to be able to perform other kind of queries over the same field.
``raw`` fields are used for Wildcard queries.

https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html

Some text fields use the ``with_positions_offsets`` term vector,
this is to have faster highlighting on big documents.
See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
Expand All @@ -74,13 +112,27 @@ class PageDocument(RTDDocTypeMixin, Document):
rank = fields.IntegerField()

# Searchable content
title = fields.TextField(attr='processed_json.title')
title = fields.TextField(
attr='processed_json.title',
fields={
'raw': WildcardField(),
},
)
sections = fields.NestedField(
attr='processed_json.sections',
properties={
'id': fields.KeywordField(),
'title': fields.TextField(),
'content': fields.TextField(term_vector='with_positions_offsets'),
'title': fields.TextField(
fields={
'raw': WildcardField(),
},
),
'content': fields.TextField(
term_vector='with_positions_offsets',
fields={
'raw': WildcardField(),
},
),
}
)
domains = fields.NestedField(
Expand All @@ -92,11 +144,20 @@ class PageDocument(RTDDocTypeMixin, Document):

# For showing in the search result
'type_display': fields.TextField(),
'docstrings': fields.TextField(term_vector='with_positions_offsets'),

# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
'name': fields.TextField(analyzer='simple'),
'docstrings': fields.TextField(
term_vector='with_positions_offsets',
fields={
'raw': WildcardField(),
},
),
'name': fields.TextField(
# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
analyzer='simple',
fields={
'raw': WildcardField(),
},
),
}
)

Expand Down
48 changes: 32 additions & 16 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields):
"""
Get a list of query objects according to the query.

If the query is a *single term* (a single word)
we try to match partial words and substrings
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).

If the query is a phrase or contains the syntax from a simple query string,
we use the SimpleQueryString query.
If the query is a single term we try to match partial words and substrings
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
otherwise we use the SimpleQueryString query.
"""
is_single_term = (
not self.use_advanced_query and
query and len(query.split()) <= 1 and
not self._is_advanced_query(query)
)
get_queries_function = (
self._get_single_term_queries
if is_single_term
if self._is_single_term(query)
else self._get_text_queries
)

Expand Down Expand Up @@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields):
The score of "and" should be higher as it satisfies both "or" and "and".

We use the Wildcard query with the query surrounded by ``*`` to match substrings.
We use the raw fields (Wildcard fields) instead of the normal field for performance.

For valid options, see:

Expand All @@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields):
)
queries.append(query_string)
for field in fields:
# Remove boosting from the field
field = re.sub(r'\^.*$', '', field)
# Remove boosting from the field,
# and query from the raw field.
field = re.sub(r'\^.*$', '.raw', field)
kwargs = {
field: {'value': f'*{query}*'},
}
Expand All @@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
prefix_length=1,
)

def _is_single_term(self, query):
"""
Check if the query is a single term.

A query is a single term if it is a single word,
if it doesn't contain the syntax from a simple query string,
and if `self.use_advanced_query` is False.
"""
is_single_term = (
not self.use_advanced_query and
query and len(query.split()) <= 1 and
not self._is_advanced_query(query)
)
return is_single_term

def _is_advanced_query(self, query):
"""
Check if query looks like to be using the syntax from a simple query string.
Expand Down Expand Up @@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields):
fields=fields,
)

raw_fields = (
raw_fields = [
# Remove boosting from the field
re.sub(r'\^.*$', '', field)
for field in fields
)
]

# Highlight from the raw fields too, if it is a single term.
if self._is_single_term(query):
raw_fields.extend([
re.sub(r'\^.*$', '.raw', field)
for field in fields
])

highlight = dict(
self._highlight_options,
Expand Down
40 changes: 28 additions & 12 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,28 @@
VersionData = namedtuple('VersionData', ['slug', 'docs_url', 'doctype'])


def get_raw_field(obj, field, default=None):
"""Get the ``raw`` version of this field or fallback to the original field."""
return (
getattr(obj, f'{field}.raw', default)
or getattr(obj, field, default)
)


class ProjectHighlightSerializer(serializers.Serializer):

name = serializers.ListField(child=serializers.CharField(), default=list)
slug = serializers.ListField(child=serializers.CharField(), default=list)
description = serializers.ListField(child=serializers.CharField(), default=list)
name = serializers.SerializerMethodField()
slug = serializers.SerializerMethodField()
description = serializers.SerializerMethodField()

def get_name(self, obj):
return list(get_raw_field(obj, 'name', []))

def get_slug(self, obj):
return list(get_raw_field(obj, 'slug', []))

def get_description(self, obj):
return list(get_raw_field(obj, 'description', []))


class ProjectSearchSerializer(serializers.Serializer):
Expand All @@ -43,7 +60,10 @@ class ProjectSearchSerializer(serializers.Serializer):

class PageHighlightSerializer(serializers.Serializer):

title = serializers.ListField(child=serializers.CharField(), default=list)
title = serializers.SerializerMethodField()

def get_title(self, obj):
return list(get_raw_field(obj, 'title', []))


class PageSearchSerializer(serializers.Serializer):
Expand Down Expand Up @@ -151,12 +171,10 @@ class DomainHighlightSerializer(serializers.Serializer):
content = serializers.SerializerMethodField()

def get_name(self, obj):
name = getattr(obj, 'domains.name', [])
return list(name)
return list(get_raw_field(obj, 'domains.name', []))

def get_content(self, obj):
docstring = getattr(obj, 'domains.docstrings', [])
return list(docstring)
return list(get_raw_field(obj, 'domains.docstrings', []))


class DomainSearchSerializer(serializers.Serializer):
Expand All @@ -175,12 +193,10 @@ class SectionHighlightSerializer(serializers.Serializer):
content = serializers.SerializerMethodField()

def get_title(self, obj):
title = getattr(obj, 'sections.title', [])
return list(title)
return list(get_raw_field(obj, 'sections.title', []))

def get_content(self, obj):
content = getattr(obj, 'sections.content', [])
return list(content)
return list(get_raw_field(obj, 'sections.content', []))


class SectionSearchSerializer(serializers.Serializer):
Expand Down
7 changes: 6 additions & 1 deletion readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,12 @@ def test_search_single_query(self, api_client):

results = resp.data['results']
assert len(results) > 0
assert 'Index' in results[0]['title']
assert 'Support' in results[0]['title']
# find is more closer than index, so is listed first.
highlights = results[0]['blocks'][0]['highlights']
assert '<span>find</span>' in highlights['content'][0]

assert 'Index' in results[1]['title']

# Query with a partial word, but we want to match that
search_params = {
Expand Down