Skip to content

Commit 3fc1fc7

Browse files
authored
Search: improve results for simple queries (#7194)
* Search: improve results for simple queries SimpleQueryString don't allow us to set an implicit value for fuzziness, but is still useful for advanced queries. This allows us to support both.
1 parent 4946529 commit 3fc1fc7

File tree

6 files changed

+129
-29
lines changed

6 files changed

+129
-29
lines changed

readthedocs/projects/models.py

+18-12
Original file line numberDiff line numberDiff line change
@@ -4,58 +4,59 @@
44
import logging
55
import os
66
import re
7+
from shlex import quote
78
from urllib.parse import urlparse
89

910
from allauth.socialaccount.providers import registry as allauth_registry
1011
from django.conf import settings
12+
from django.conf.urls import include
1113
from django.contrib.auth.models import User
1214
from django.core.files.storage import get_storage_class
1315
from django.db import models
1416
from django.db.models import Prefetch
15-
from django.urls import reverse, re_path
16-
from django.conf.urls import include
17+
from django.urls import re_path, reverse
1718
from django.utils.functional import cached_property
1819
from django.utils.translation import ugettext_lazy as _
19-
from django_extensions.db.models import TimeStampedModel
2020
from django.views import defaults
21-
from shlex import quote
21+
from django_extensions.db.models import TimeStampedModel
2222
from taggit.managers import TaggableManager
2323

2424
from readthedocs.api.v2.client import api
25-
from readthedocs.builds.constants import LATEST, STABLE, INTERNAL, EXTERNAL
25+
from readthedocs.builds.constants import EXTERNAL, INTERNAL, LATEST, STABLE
26+
from readthedocs.constants import pattern_opts
2627
from readthedocs.core.resolver import resolve, resolve_domain
2728
from readthedocs.core.utils import broadcast, slugify
28-
from readthedocs.constants import pattern_opts
2929
from readthedocs.doc_builder.constants import DOCKER_LIMITS
3030
from readthedocs.projects import constants
3131
from readthedocs.projects.exceptions import ProjectConfigurationError
3232
from readthedocs.projects.managers import HTMLFileManager
3333
from readthedocs.projects.querysets import (
3434
ChildRelatedProjectQuerySet,
3535
FeatureQuerySet,
36+
HTMLFileQuerySet,
3637
ProjectQuerySet,
3738
RelatedProjectQuerySet,
38-
HTMLFileQuerySet,
3939
)
4040
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
4141
from readthedocs.projects.validators import (
4242
validate_domain_name,
4343
validate_repository_url,
4444
)
4545
from readthedocs.projects.version_handling import determine_stable_version
46-
from readthedocs.search.parse_json import process_file, process_mkdocs_index_file
46+
from readthedocs.search.parse_json import (
47+
process_file,
48+
process_mkdocs_index_file,
49+
)
4750
from readthedocs.vcs_support.backends import backend_cls
4851
from readthedocs.vcs_support.utils import Lock, NonBlockingLock
4952

50-
5153
from .constants import (
52-
MEDIA_TYPES,
53-
MEDIA_TYPE_PDF,
5454
MEDIA_TYPE_EPUB,
5555
MEDIA_TYPE_HTMLZIP,
56+
MEDIA_TYPE_PDF,
57+
MEDIA_TYPES,
5658
)
5759

58-
5960
log = logging.getLogger(__name__)
6061

6162

@@ -1615,6 +1616,7 @@ def add_features(sender, **kwargs):
16151616
SPHINX_PARALLEL = 'sphinx_parallel'
16161617
USE_SPHINX_BUILDERS = 'use_sphinx_builders'
16171618
DEDUPLICATE_BUILDS = 'deduplicate_builds'
1619+
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
16181620

16191621
FEATURES = (
16201622
(USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
@@ -1728,6 +1730,10 @@ def add_features(sender, **kwargs):
17281730
DEDUPLICATE_BUILDS,
17291731
_('Mark duplicated builds as NOOP to be skipped by builders'),
17301732
),
1733+
(
1734+
DEFAULT_TO_FUZZY_SEARCH,
1735+
_('Default to fuzzy search for simple search queries'),
1736+
)
17311737
)
17321738

17331739
projects = models.ManyToManyField(

readthedocs/search/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion
1818
from readthedocs.builds.models import Version
1919
from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
20-
from readthedocs.projects.models import Project
20+
from readthedocs.projects.models import Feature, Project
2121
from readthedocs.search import tasks, utils
2222
from readthedocs.search.faceted_search import PageSearch
2323

@@ -349,6 +349,7 @@ def get_queryset(self):
349349
user=self.request.user,
350350
# We use a permission class to control authorization
351351
filter_by_user=False,
352+
use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
352353
)
353354
return queryset
354355

readthedocs/search/faceted_search.py

+60-13
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from elasticsearch import Elasticsearch
55
from elasticsearch_dsl import FacetedSearch, TermsFacet
66
from elasticsearch_dsl.faceted_search import NestedFacet
7-
from elasticsearch_dsl.query import Bool, Match, Nested, SimpleQueryString
7+
from elasticsearch_dsl.query import Bool, MultiMatch, Nested, SimpleQueryString
88

99
from readthedocs.core.utils.extend import SettingsOverrideObject
1010
from readthedocs.search.documents import PageDocument, ProjectDocument
@@ -27,17 +27,21 @@ class RTDFacetedSearch(FacetedSearch):
2727
'post_tags': ['</span>'],
2828
}
2929

30-
def __init__(self, query=None, filters=None, user=None, **kwargs):
30+
def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
3131
"""
3232
Pass in a user in order to filter search results by privacy.
3333
34+
If `use_advanced_query` is `True`,
35+
force to always use `SimpleQueryString` for the text query object.
36+
3437
.. warning::
3538
3639
The `self.user` and `self.filter_by_user` attributes
3740
aren't currently used on the .org, but are used on the .com.
3841
"""
3942
self.user = user
4043
self.filter_by_user = kwargs.pop('filter_by_user', True)
44+
self.use_advanced_query = use_advanced_query
4145

4246
# Hack a fix to our broken connection pooling
4347
# This creates a new connection on every request,
@@ -55,6 +59,49 @@ def __init__(self, query=None, filters=None, user=None, **kwargs):
5559
}
5660
super().__init__(query=query, filters=valid_filters, **kwargs)
5761

62+
def _get_text_query(self, *, query, fields, operator):
63+
"""
64+
Returns a text query object according to the query.
65+
66+
- SimpleQueryString: Provides a syntax to let advanced users manipulate
67+
the results explicitly.
68+
- MultiMatch: Allows us to have more control over the results
69+
(like fuzziness) to provide a better experience for simple queries.
70+
"""
71+
if self.use_advanced_query or self._is_advanced_query(query):
72+
query_string = SimpleQueryString(
73+
query=query,
74+
fields=fields,
75+
default_operator=operator
76+
)
77+
else:
78+
query_string = MultiMatch(
79+
query=query,
80+
fields=fields,
81+
operator=operator,
82+
fuzziness="AUTO",
83+
)
84+
return query_string
85+
86+
def _is_advanced_query(self, query):
87+
"""
88+
Check if query looks like to be using the syntax from a simple query string.
89+
90+
.. note::
91+
92+
We don't check if the syntax is valid.
93+
The tokens used aren't very common in a normal query, so checking if
94+
the query contains any of them should be enough to determinate if
95+
it's an advanced query.
96+
97+
Simple query syntax:
98+
99+
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax
100+
"""
101+
tokens = {'+', '|', '-', '"', '*', '(', ')', '~'}
102+
query_tokens = set(query)
103+
return not tokens.isdisjoint(query_tokens)
104+
58105
def query(self, search, query):
59106
"""
60107
Add query part to ``search`` when needed.
@@ -71,10 +118,11 @@ def query(self, search, query):
71118

72119
# need to search for both 'and' and 'or' operations
73120
# the score of and should be higher as it satisfies both or and and
74-
75121
for operator in self.operators:
76-
query_string = SimpleQueryString(
77-
query=query, fields=self.fields, default_operator=operator
122+
query_string = self._get_text_query(
123+
query=query,
124+
fields=self.fields,
125+
operator=operator,
78126
)
79127
all_queries.append(query_string)
80128

@@ -135,13 +183,12 @@ def query(self, search, query):
135183

136184
# match query for the title (of the page) field.
137185
for operator in self.operators:
138-
all_queries.append(
139-
SimpleQueryString(
140-
query=query,
141-
fields=self.fields,
142-
default_operator=operator
143-
)
186+
query_string = self._get_text_query(
187+
query=query,
188+
fields=self.fields,
189+
operator=operator,
144190
)
191+
all_queries.append(query_string)
145192

146193
# nested query for search in sections
147194
sections_nested_query = self.generate_nested_query(
@@ -186,10 +233,10 @@ def generate_nested_query(self, query, path, fields, inner_hits):
186233
queries = []
187234

188235
for operator in self.operators:
189-
query_string = SimpleQueryString(
236+
query_string = self._get_text_query(
190237
query=query,
191238
fields=fields,
192-
default_operator=operator
239+
operator=operator,
193240
)
194241
queries.append(query_string)
195242

readthedocs/search/tests/test_api.py

+47-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
SPHINX_HTMLDIR,
1515
SPHINX_SINGLEHTML,
1616
)
17-
from readthedocs.projects.models import HTMLFile, Project
17+
from readthedocs.projects.models import HTMLFile, Project, Feature
1818
from readthedocs.search.api import PageSearchAPIView
1919
from readthedocs.search.documents import PageDocument
2020
from readthedocs.search.tests.utils import (
@@ -451,6 +451,52 @@ def test_search_correct_link_for_index_page_subdirectory_htmldir_projects(self,
451451
assert result['project'] == project.slug
452452
assert result['link'].endswith('en/latest/guides/')
453453

454+
def test_search_advanced_query_detection(self, api_client):
455+
project = Project.objects.get(slug='docs')
456+
feature, _ = Feature.objects.get_or_create(
457+
feature_id=Feature.DEFAULT_TO_FUZZY_SEARCH,
458+
)
459+
project.feature_set.add(feature)
460+
project.save()
461+
version = project.versions.all().first()
462+
463+
# Query with a typo should return results
464+
search_params = {
465+
'project': project.slug,
466+
'version': version.slug,
467+
'q': 'indx',
468+
}
469+
resp = self.get_search(api_client, search_params)
470+
assert resp.status_code == 200
471+
472+
results = resp.data['results']
473+
assert len(results) > 0
474+
assert 'Index' in results[0]['title']
475+
476+
# Query with a typo, but we want to match that
477+
search_params = {
478+
'project': project.slug,
479+
'version': version.slug,
480+
'q': '"indx"',
481+
}
482+
resp = self.get_search(api_client, search_params)
483+
assert resp.status_code == 200
484+
485+
assert len(resp.data['results']) == 0
486+
487+
# Exact query still works
488+
search_params = {
489+
'project': project.slug,
490+
'version': version.slug,
491+
'q': '"index"',
492+
}
493+
resp = self.get_search(api_client, search_params)
494+
assert resp.status_code == 200
495+
496+
results = resp.data['results']
497+
assert len(results) > 0
498+
assert 'Index' in results[0]['title']
499+
454500

455501
class TestDocumentSearch(BaseTestDocumentSearch):
456502

readthedocs/search/tests/test_xss.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
class TestXSS:
99

1010
def test_facted_page_xss(self, client, project):
11-
query = 'XSS'
11+
query = '"XSS"'
1212
page_search = PageSearch(query=query)
1313
results = page_search.execute()
1414
expected = """

0 commit comments

Comments
 (0)