Skip to content

Search: allow to search on different versions of subprojects #7634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,8 +1568,14 @@ def add_features(sender, **kwargs):
SKIP_SYNC_VERSIONS = 'skip_sync_versions'
CACHED_ENVIRONMENT = 'cached_environment'
LIMIT_CONCURRENT_BUILDS = 'limit_concurrent_builds'

# Search related features
DISABLE_SERVER_SIDE_SEARCH = 'disable_server_side_search'
ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION = 'search_subprojects_on_default_version'

FORCE_SPHINX_FROM_VENV = 'force_sphinx_from_venv'
LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
VCS_REMOTE_LISTING = 'vcs_remote_listing'
Expand All @@ -1578,8 +1584,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_BUILDERS = 'use_sphinx_builders'
DEDUPLICATE_BUILDS = 'deduplicate_builds'
USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
DONT_CREATE_INDEX = 'dont_create_index'
USE_NEW_PIP_RESOLVER = 'use_new_pip_resolver'
DONT_INSTALL_LATEST_PIP = 'dont_install_latest_pip'
Expand Down Expand Up @@ -1667,6 +1671,8 @@ def add_features(sender, **kwargs):
LIMIT_CONCURRENT_BUILDS,
_('Limit the amount of concurrent builds'),
),

# Search related features.
(
DISABLE_SERVER_SIDE_SEARCH,
_('Disable server side search'),
Expand All @@ -1675,6 +1681,22 @@ def add_features(sender, **kwargs):
ENABLE_MKDOCS_SERVER_SIDE_SEARCH,
_('Enable server side search for MkDocs projects'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead of relying on other sources'),
),
(
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
_(
'When searching subprojects, default to their default version if they '
'don\'t have the same version as the main project'
),
),

(
FORCE_SPHINX_FROM_VENV,
_('Force to use Sphinx from the current virtual environment'),
Expand Down Expand Up @@ -1710,14 +1732,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_RTD_EXT_LATEST,
_('Use latest version of the Read the Docs Sphinx extension'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead of relying on other sources'),
),
(
DONT_CREATE_INDEX,
_('Do not create index.md or README.rst if the project does not have one.'),
Expand Down
157 changes: 99 additions & 58 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from readthedocs.search import tasks
from readthedocs.search.faceted_search import PageSearch

from .serializers import PageSearchSerializer
from .serializers import PageSearchSerializer, VersionData

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -185,67 +185,94 @@ def _validate_query_params(self):

def _get_all_projects_data(self):
"""
Return a dict containing the project slug and its version URL and version's doctype.

The dictionary contains the project and its subprojects. Each project's
slug is used as a key and a tuple with the documentation URL and doctype
from the version. Example:

{
"requests": (
"https://requests.readthedocs.io/en/latest/",
"sphinx",
),
"requests-oauth": (
"https://requests-oauth.readthedocs.io/en/latest/",
"sphinx_htmldir",
),
}
Return a dictionary of the project itself and all its subprojects.

:rtype: dict
"""
all_projects = self._get_all_projects()
version_slug = self._get_version().slug
project_urls = {}
for project in all_projects:
project_urls[project.slug] = project.get_docs_url(version_slug=version_slug)

versions_doctype = (
Version.objects
.filter(project__slug__in=project_urls.keys(), slug=version_slug)
.values_list('project__slug', 'documentation_type')
)
Example:

projects_data = {
project_slug: (project_urls[project_slug], doctype)
for project_slug, doctype in versions_doctype
}
return projects_data
.. code::

def _get_all_projects(self):
"""
Returns a list of the project itself and all its subprojects the user has permissions over.
{
"requests": VersionData(
"latest",
"sphinx",
"https://requests.readthedocs.io/en/latest/",
),
"requests-oauth": VersionData(
"latest",
"sphinx_htmldir",
"https://requests-oauth.readthedocs.io/en/latest/",
),
}

.. note:: The response is cached into the instance.

:rtype: list
:rtype: A dictionary of project slugs mapped to a `VersionData` object.
"""
cache_key = '__cached_projects_data'
projects_data = getattr(self, cache_key, None)
if projects_data is not None:
return projects_data

main_version = self._get_version()
main_project = self._get_project()

all_projects = [main_project]
projects_data = {
main_project.slug: VersionData(
slug=main_version.slug,
doctype=main_version.documentation_type,
docs_url=main_project.get_docs_url(version_slug=main_version.slug),
)
}

subprojects = Project.objects.filter(
superprojects__parent_id=main_project.id,
)
for project in subprojects:
version = (
Version.internal
.public(user=self.request.user, project=project, include_hidden=False)
.filter(slug=main_version.slug)
.first()
version = self._get_subproject_version(
version_slug=main_version.slug,
subproject=project,
)
if version:
all_projects.append(version.project)
return all_projects

# Fallback to the default version of the subproject.
if (
not version
and main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION)
and project.default_version
):
version = self._get_subproject_version(
version_slug=project.default_version,
subproject=project,
)

if version and self._has_permission(self.request.user, version):
url = project.get_docs_url(version_slug=version.slug)
projects_data[project.slug] = VersionData(
slug=version.slug,
doctype=version.documentation_type,
docs_url=url,
)

setattr(self, cache_key, projects_data)
return projects_data

def _get_subproject_version(self, version_slug, subproject):
"""Get a version from the subproject."""
return (
Version.internal
.public(user=self.request.user, project=subproject, include_hidden=False)
.filter(slug=version_slug)
.first()
)

def _has_permission(self, user, version):
"""
Check if `user` is authorized to access `version`.

The queryset from `_get_subproject_version` already filters public
projects. This is mainly to be overridden in .com to make use of
the auth backends in the proxied API.
"""
return True

def _record_query(self, response):
project_slug = self._get_project().slug
Expand Down Expand Up @@ -275,26 +302,40 @@ def get_queryset(self):
calling ``search.execute().hits``. This is why an DSL search object
is compatible with DRF's paginator.
"""
main_project = self._get_project()
main_version = self._get_version()
projects = {}
filters = {}
filters['project'] = [p.slug for p in self._get_all_projects()]
filters['version'] = self._get_version().slug

# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []
if main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION):
projects = {
project: version.slug
for project, version in self._get_all_projects_data().items()
}
# Check to avoid searching all projects in case it's empty.
if not projects:
log.info('Unable to find a version to search')
return []
else:
filters['project'] = list(self._get_all_projects_data().keys())
filters['version'] = main_version.slug
# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []

query = self.request.query_params['q']
queryset = PageSearch(
query=query,
projects=projects,
filters=filters,
user=self.request.user,
# We use a permission class to control authorization
filter_by_user=False,
use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
)
return queryset

Expand Down
42 changes: 37 additions & 5 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MultiMatch,
Nested,
SimpleQueryString,
Term,
Wildcard,
)

Expand All @@ -35,12 +36,23 @@ class RTDFacetedSearch(FacetedSearch):
'post_tags': ['</span>'],
}

def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
def __init__(
self,
query=None,
filters=None,
projects=None,
user=None,
use_advanced_query=True,
**kwargs,
):
"""
Pass in a user in order to filter search results by privacy.

If `use_advanced_query` is `True`,
force to always use `SimpleQueryString` for the text query object.
:param projects: A dictionary of project slugs mapped to a `VersionData` object.
Results are filtered by these values.

:param use_advanced_query: If `True` forces to always use
`SimpleQueryString` for the text query object.

.. warning::

Expand All @@ -50,6 +62,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True,
self.user = user
self.filter_by_user = kwargs.pop('filter_by_user', True)
self.use_advanced_query = use_advanced_query
self.projects = projects or {}

# Hack a fix to our broken connection pooling
# This creates a new connection on every request,
Expand Down Expand Up @@ -259,7 +272,12 @@ def total_count(self):
return s.hits.total

def query(self, search, query):
"""Manipulates the query to support nested queries and a custom rank for pages."""
"""
Manipulates the query to support nested queries and a custom rank for pages.

If `self.projects` was given, we use it to filter the documents that
match the same project and version.
"""
search = search.highlight_options(**self._highlight_options)

queries = self._get_queries(
Expand All @@ -280,8 +298,22 @@ def query(self, search, query):
)

queries.extend([sections_nested_query, domains_nested_query])
bool_query = Bool(should=queries)

if self.projects:
versions_query = [
Bool(
must=[
Term(project={'value': project}),
Term(version={'value': version}),
]
)
for project, version in self.projects.items()
]
bool_query = Bool(must=[bool_query, Bool(should=versions_query)])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little worried about this as you noted, but let's see how it performs in prod.


final_query = FunctionScore(
query=Bool(should=queries),
query=bool_query,
script_score=self._get_script_score(),
)
search = search.query(final_query)
Expand Down
18 changes: 11 additions & 7 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from readthedocs.projects.models import Project


# Structure used for storing cached data of a project mostly.
ProjectData = namedtuple('ProjectData', ['docs_url', 'version_doctype'])
# Structure used for storing cached data of a version mostly.
VersionData = namedtuple('VersionData', ['slug', 'docs_url', 'doctype'])


class ProjectHighlightSerializer(serializers.Serializer):
Expand Down Expand Up @@ -88,14 +88,14 @@ def _get_full_path(self, obj):
it's cached into ``project_data``.
"""
# First try to build the URL from the context.
project_data = self.context.get('projects_data', {}).get(obj.project)
if project_data:
docs_url, doctype = project_data
version_data = self.context.get('projects_data', {}).get(obj.project)
if version_data:
docs_url = version_data.docs_url
path = obj.full_path

# Generate an appropriate link for the doctypes that use htmldir,
# and always end it with / so it goes directly to proxito.
if doctype in {SPHINX_HTMLDIR, MKDOCS}:
if version_data.doctype in {SPHINX_HTMLDIR, MKDOCS}:
path = re.sub('(^|/)index.html$', '/', path)

return docs_url.rstrip('/') + '/' + path.lstrip('/')
Expand All @@ -106,7 +106,11 @@ def _get_full_path(self, obj):
docs_url = project.get_docs_url(version_slug=obj.version)
# cache the project URL
projects_data = self.context.setdefault('projects_data', {})
projects_data[obj.project] = ProjectData(docs_url, '')
projects_data[obj.project] = VersionData(
slug=obj.version,
docs_url=docs_url,
doctype=None,
)
return docs_url + obj.full_path

return None
Expand Down
Loading