Search: improve results for simple queries (#7194)

stsewd · web-flow · commit 3fc1fc7d1c96 · 2020-06-16T12:59:10.000-05:00
* Search: improve results for simple queries

SimpleQueryString doesn't allow us to set an implicit value for fuzziness,
but it is still useful for advanced queries.
This change allows us to support both.
diff --git a/common b/common
@@ -1 +1 @@
-Subproject commit 042949ff11321a9d044efdf41b0620089aac1981
+Subproject commit aa711563ca288dd9a5860d283a78ed7b54425b9d
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -4,58 +4,59 @@
 import logging
 import os
 import re
+from shlex import quote
 from urllib.parse import urlparse
 
 from allauth.socialaccount.providers import registry as allauth_registry
 from django.conf import settings
+from django.conf.urls import include
 from django.contrib.auth.models import User
 from django.core.files.storage import get_storage_class
 from django.db import models
 from django.db.models import Prefetch
-from django.urls import reverse, re_path
-from django.conf.urls import include
+from django.urls import re_path, reverse
 from django.utils.functional import cached_property
 from django.utils.translation import ugettext_lazy as _
-from django_extensions.db.models import TimeStampedModel
 from django.views import defaults
-from shlex import quote
+from django_extensions.db.models import TimeStampedModel
 from taggit.managers import TaggableManager
 
 from readthedocs.api.v2.client import api
-from readthedocs.builds.constants import LATEST, STABLE, INTERNAL, EXTERNAL
+from readthedocs.builds.constants import EXTERNAL, INTERNAL, LATEST, STABLE
+from readthedocs.constants import pattern_opts
 from readthedocs.core.resolver import resolve, resolve_domain
 from readthedocs.core.utils import broadcast, slugify
-from readthedocs.constants import pattern_opts
 from readthedocs.doc_builder.constants import DOCKER_LIMITS
 from readthedocs.projects import constants
 from readthedocs.projects.exceptions import ProjectConfigurationError
 from readthedocs.projects.managers import HTMLFileManager
 from readthedocs.projects.querysets import (
     ChildRelatedProjectQuerySet,
     FeatureQuerySet,
+    HTMLFileQuerySet,
     ProjectQuerySet,
     RelatedProjectQuerySet,
-    HTMLFileQuerySet,
 )
 from readthedocs.projects.templatetags.projects_tags import sort_version_aware
 from readthedocs.projects.validators import (
     validate_domain_name,
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parse_json import process_file, process_mkdocs_index_file
+from readthedocs.search.parse_json import (
+    process_file,
+    process_mkdocs_index_file,
+)
 from readthedocs.vcs_support.backends import backend_cls
 from readthedocs.vcs_support.utils import Lock, NonBlockingLock
 
-
 from .constants import (
-    MEDIA_TYPES,
-    MEDIA_TYPE_PDF,
     MEDIA_TYPE_EPUB,
     MEDIA_TYPE_HTMLZIP,
+    MEDIA_TYPE_PDF,
+    MEDIA_TYPES,
 )
 
-
 log = logging.getLogger(__name__)
 
 
@@ -1615,6 +1616,7 @@ def add_features(sender, **kwargs):
     SPHINX_PARALLEL = 'sphinx_parallel'
     USE_SPHINX_BUILDERS = 'use_sphinx_builders'
     DEDUPLICATE_BUILDS = 'deduplicate_builds'
+    DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
 
     FEATURES = (
         (USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
@@ -1728,6 +1730,10 @@ def add_features(sender, **kwargs):
             DEDUPLICATE_BUILDS,
             _('Mark duplicated builds as NOOP to be skipped by builders'),
         ),
+        (
+            DEFAULT_TO_FUZZY_SEARCH,
+            _('Default to fuzzy search for simple search queries'),
+        )
     )
 
     projects = models.ManyToManyField(
diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -17,7 +17,7 @@
 from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion
 from readthedocs.builds.models import Version
 from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
-from readthedocs.projects.models import Project
+from readthedocs.projects.models import Feature, Project
 from readthedocs.search import tasks, utils
 from readthedocs.search.faceted_search import PageSearch
 
@@ -349,6 +349,7 @@ def get_queryset(self):
             user=self.request.user,
             # We use a permission class to control authorization
             filter_by_user=False,
+            use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
         )
         return queryset
 
diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -4,7 +4,7 @@
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import FacetedSearch, TermsFacet
 from elasticsearch_dsl.faceted_search import NestedFacet
-from elasticsearch_dsl.query import Bool, Match, Nested, SimpleQueryString
+from elasticsearch_dsl.query import Bool, MultiMatch, Nested, SimpleQueryString
 
 from readthedocs.core.utils.extend import SettingsOverrideObject
 from readthedocs.search.documents import PageDocument, ProjectDocument
@@ -27,17 +27,21 @@ class RTDFacetedSearch(FacetedSearch):
         'post_tags': ['</span>'],
     }
 
-    def __init__(self, query=None, filters=None, user=None, **kwargs):
+    def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
         """
         Pass in a user in order to filter search results by privacy.
 
+        If `use_advanced_query` is `True`,
+        always use `SimpleQueryString` for the text query object.
+
         .. warning::
 
             The `self.user` and `self.filter_by_user` attributes
             aren't currently used on the .org, but are used on the .com.
         """
         self.user = user
         self.filter_by_user = kwargs.pop('filter_by_user', True)
+        self.use_advanced_query = use_advanced_query
 
         # Hack a fix to our broken connection pooling
         # This creates a new connection on every request,
@@ -55,6 +59,49 @@ def __init__(self, query=None, filters=None, user=None, **kwargs):
         }
         super().__init__(query=query, filters=valid_filters, **kwargs)
 
+    def _get_text_query(self, *, query, fields, operator):
+        """
+        Returns a text query object according to the query.
+
+        - SimpleQueryString: Provides a syntax to let advanced users manipulate
+          the results explicitly.
+        - MultiMatch: Allows us to have more control over the results
+          (like fuzziness) to provide a better experience for simple queries.
+        """
+        if self.use_advanced_query or self._is_advanced_query(query):
+            query_string = SimpleQueryString(
+                query=query,
+                fields=fields,
+                default_operator=operator
+            )
+        else:
+            query_string = MultiMatch(
+                query=query,
+                fields=fields,
+                operator=operator,
+                fuzziness="AUTO",
+            )
+        return query_string
+
+    def _is_advanced_query(self, query):
+        """
+        Check if the query looks like it uses the simple query string syntax.
+
+        .. note::
+
+           We don't check if the syntax is valid.
+           The tokens used aren't very common in a normal query, so checking if
+           the query contains any of them should be enough to determine if
+           it's an advanced query.
+
+        Simple query syntax:
+
+        https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax
+        """
+        tokens = {'+', '|', '-', '"', '*', '(', ')', '~'}
+        query_tokens = set(query)
+        return not tokens.isdisjoint(query_tokens)
+
     def query(self, search, query):
         """
         Add query part to ``search`` when needed.
@@ -71,10 +118,11 @@ def query(self, search, query):
 
         # need to search for both 'and' and 'or' operations
         # the score of and should be higher as it satisfies both or and and
-
         for operator in self.operators:
-            query_string = SimpleQueryString(
-                query=query, fields=self.fields, default_operator=operator
+            query_string = self._get_text_query(
+                query=query,
+                fields=self.fields,
+                operator=operator,
             )
             all_queries.append(query_string)
 
@@ -135,13 +183,12 @@ def query(self, search, query):
 
         # match query for the title (of the page) field.
         for operator in self.operators:
-            all_queries.append(
-                SimpleQueryString(
-                    query=query,
-                    fields=self.fields,
-                    default_operator=operator
-                )
+            query_string = self._get_text_query(
+                query=query,
+                fields=self.fields,
+                operator=operator,
             )
+            all_queries.append(query_string)
 
         # nested query for search in sections
         sections_nested_query = self.generate_nested_query(
@@ -186,10 +233,10 @@ def generate_nested_query(self, query, path, fields, inner_hits):
         queries = []
 
         for operator in self.operators:
-            query_string = SimpleQueryString(
+            query_string = self._get_text_query(
                 query=query,
                 fields=fields,
-                default_operator=operator
+                operator=operator,
             )
             queries.append(query_string)
 
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -14,7 +14,7 @@
     SPHINX_HTMLDIR,
     SPHINX_SINGLEHTML,
 )
-from readthedocs.projects.models import HTMLFile, Project
+from readthedocs.projects.models import HTMLFile, Project, Feature
 from readthedocs.search.api import PageSearchAPIView
 from readthedocs.search.documents import PageDocument
 from readthedocs.search.tests.utils import (
@@ -451,6 +451,52 @@ def test_search_correct_link_for_index_page_subdirectory_htmldir_projects(self,
         assert result['project'] == project.slug
         assert result['link'].endswith('en/latest/guides/')
 
+    def test_search_advanced_query_detection(self, api_client):
+        project = Project.objects.get(slug='docs')
+        feature, _ = Feature.objects.get_or_create(
+            feature_id=Feature.DEFAULT_TO_FUZZY_SEARCH,
+        )
+        project.feature_set.add(feature)
+        project.save()
+        version = project.versions.all().first()
+
+        # Query with a typo should return results
+        search_params = {
+            'project': project.slug,
+            'version': version.slug,
+            'q': 'indx',
+        }
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        results = resp.data['results']
+        assert len(results) > 0
+        assert 'Index' in results[0]['title']
+
+        # Query with a typo, but we want to match that
+        search_params = {
+            'project': project.slug,
+            'version': version.slug,
+            'q': '"indx"',
+        }
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        assert len(resp.data['results']) == 0
+
+        # Exact query still works
+        search_params = {
+            'project': project.slug,
+            'version': version.slug,
+            'q': '"index"',
+        }
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        results = resp.data['results']
+        assert len(results) > 0
+        assert 'Index' in results[0]['title']
+
 
 class TestDocumentSearch(BaseTestDocumentSearch):
 
diff --git a/readthedocs/search/tests/test_xss.py b/readthedocs/search/tests/test_xss.py
@@ -8,7 +8,7 @@
 class TestXSS:
 
     def test_facted_page_xss(self, client, project):
-        query = 'XSS'
+        query = '"XSS"'
         page_search = PageSearch(query=query)
         results = page_search.execute()
         expected = """