Skip to content

Search: allow to search on different versions of subprojects #7634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1568,8 +1568,14 @@ def add_features(sender, **kwargs):
SKIP_SYNC_VERSIONS = 'skip_sync_versions'
CACHED_ENVIRONMENT = 'cached_environment'
LIMIT_CONCURRENT_BUILDS = 'limit_concurrent_builds'

# Search related features
DISABLE_SERVER_SIDE_SEARCH = 'disable_server_side_search'
ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION = 'search_subprojects_on_default_version'

FORCE_SPHINX_FROM_VENV = 'force_sphinx_from_venv'
LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
VCS_REMOTE_LISTING = 'vcs_remote_listing'
Expand All @@ -1578,8 +1584,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_BUILDERS = 'use_sphinx_builders'
DEDUPLICATE_BUILDS = 'deduplicate_builds'
USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
DONT_CREATE_INDEX = 'dont_create_index'
USE_NEW_PIP_RESOLVER = 'use_new_pip_resolver'
DONT_INSTALL_LATEST_PIP = 'dont_install_latest_pip'
Expand Down Expand Up @@ -1667,6 +1671,8 @@ def add_features(sender, **kwargs):
LIMIT_CONCURRENT_BUILDS,
_('Limit the amount of concurrent builds'),
),

# Search related features.
(
DISABLE_SERVER_SIDE_SEARCH,
_('Disable server side search'),
Expand All @@ -1675,6 +1681,22 @@ def add_features(sender, **kwargs):
ENABLE_MKDOCS_SERVER_SIDE_SEARCH,
_('Enable server side search for MkDocs projects'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead or relying in other sources'),
),
(
SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
_(
'When searching subprojects default to its default version if it doesn\'t '
'have the same version as the main project'
),
),

(
FORCE_SPHINX_FROM_VENV,
_('Force to use Sphinx from the current virtual environment'),
Expand Down Expand Up @@ -1710,14 +1732,6 @@ def add_features(sender, **kwargs):
USE_SPHINX_RTD_EXT_LATEST,
_('Use latest version of the Read the Docs Sphinx extension'),
),
(
DEFAULT_TO_FUZZY_SEARCH,
_('Default to fuzzy search for simple search queries'),
),
(
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead or relying in other sources'),
),
(
DONT_CREATE_INDEX,
_('Do not create index.md or README.rst if the project does not have one.'),
Expand Down
54 changes: 40 additions & 14 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,26 @@ def _get_all_projects_data(self):
subprojects = Project.objects.filter(
superprojects__parent_id=main_project.id,
)
for project in subprojects:
for subproject in subprojects:
version = self._get_subproject_version(
version_slug=main_version.slug,
subproject=project,
subproject=subproject,
)

# Fallback to the default version of the subproject.
if (
not version
and main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION)
and subproject.default_version
):
version = self._get_subproject_version(
version_slug=subproject.default_version,
subproject=subproject,
)

if version and self._has_permission(self.request.user, version):
url = project.get_docs_url(version_slug=version.slug)
projects_data[project.slug] = VersionData(
url = subproject.get_docs_url(version_slug=version.slug)
projects_data[subproject.slug] = VersionData(
slug=version.slug,
doctype=version.documentation_type,
docs_url=url,
Expand Down Expand Up @@ -290,26 +302,40 @@ def get_queryset(self):
calling ``search.execute().hits``. This is why an DSL search object
is compatible with DRF's paginator.
"""
main_project = self._get_project()
main_version = self._get_version()
projects = {}
filters = {}
filters['project'] = list(self._get_all_projects_data().keys())
filters['version'] = self._get_version().slug

# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []
if main_project.has_feature(Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION):
projects = {
project: version.slug
for project, version in self._get_all_projects_data().items()
}
# Check to avoid searching all projects in case it's empty.
if not projects:
log.info('Unable to find a version to search')
return []
else:
filters['project'] = list(self._get_all_projects_data().keys())
filters['version'] = main_version.slug
# Check to avoid searching all projects in case these filters are empty.
if not filters['project']:
log.info('Unable to find a project to search')
return []
if not filters['version']:
log.info('Unable to find a version to search')
return []

query = self.request.query_params['q']
queryset = PageSearch(
query=query,
projects=projects,
filters=filters,
user=self.request.user,
# We use a permission class to control authorization
filter_by_user=False,
use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
)
return queryset

Expand Down
42 changes: 37 additions & 5 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MultiMatch,
Nested,
SimpleQueryString,
Term,
Wildcard,
)

Expand Down Expand Up @@ -38,12 +39,23 @@ class RTDFacetedSearch(FacetedSearch):
'post_tags': ['</span>'],
}

def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
def __init__(
self,
query=None,
filters=None,
projects=None,
user=None,
use_advanced_query=True,
**kwargs,
):
"""
Pass in a user in order to filter search results by privacy.

If `use_advanced_query` is `True`,
force to always use `SimpleQueryString` for the text query object.
:param projects: A dictionary of project slugs mapped to a `VersionData` object.
Results are filter with these values.

:param use_advanced_query: If `True` forces to always use
`SimpleQueryString` for the text query object.

.. warning::

Expand All @@ -53,6 +65,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True,
self.user = user
self.filter_by_user = kwargs.pop('filter_by_user', True)
self.use_advanced_query = use_advanced_query
self.projects = projects or {}

# Hack a fix to our broken connection pooling
# This creates a new connection on every request,
Expand Down Expand Up @@ -265,7 +278,12 @@ def total_count(self):
return s.hits.total

def query(self, search, query):
"""Manipulates the query to support nested queries and a custom rank for pages."""
"""
Manipulates the query to support nested queries and a custom rank for pages.

If `self.projects` was given, we use it to filter the documents that
match the same project and version.
"""
search = search.highlight_options(**self._highlight_options)
search = search.source(excludes=self.excludes)

Expand All @@ -287,8 +305,22 @@ def query(self, search, query):
)

queries.extend([sections_nested_query, domains_nested_query])
bool_query = Bool(should=queries)

if self.projects:
versions_query = [
Bool(
must=[
Term(project={'value': project}),
Term(version={'value': version}),
]
)
for project, version in self.projects.items()
]
bool_query = Bool(must=[bool_query, Bool(should=versions_query)])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I"m a little worried about this as you noted, but lets see how it performs in prod.


final_query = FunctionScore(
query=Bool(should=queries),
query=bool_query,
script_score=self._get_script_score(),
)
search = search.query(final_query)
Expand Down
51 changes: 51 additions & 0 deletions readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,62 @@ def test_doc_search_subprojects(self, api_client, all_projects):
# First result should be the subproject
first_result = data[0]
assert first_result['project'] == subproject.slug
# The result is from the same version as the main project.
assert first_result['version'] == version.slug
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=version.slug)
link = first_result['domain'] + first_result['path']
assert document_link in link

def test_doc_search_subprojects_default_version(self, api_client, all_projects):
"""Return results from subprojects that match the version from the main project or fallback to its default version."""
project = all_projects[0]
version = project.versions.all()[0]
feature, _ = Feature.objects.get_or_create(
feature_id=Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
)
project.feature_set.add(feature)

subproject = all_projects[1]
subproject_version = subproject.versions.all()[0]

# Change the name of the version, and make it default.
subproject_version.slug = 'different'
subproject_version.save()
subproject.default_version = subproject_version.slug
subproject.save()
subproject.versions.filter(slug=version.slug).delete()

# Refresh index
version_files = HTMLFile.objects.all().filter(version=subproject_version)
for f in version_files:
PageDocument().update(f)

# Add another project as subproject of the project
project.add_subproject(subproject)

# Now search with subproject content but explicitly filter by the parent project
query = get_search_query_from_project_file(project_slug=subproject.slug)
search_params = {
'q': query,
'project': project.slug,
'version': version.slug
}
resp = self.get_search(api_client, search_params)
assert resp.status_code == 200

data = resp.data['results']
assert len(data) >= 1 # there may be results from another projects

# First result should be the subproject
first_result = data[0]
assert first_result['project'] == subproject.slug
assert first_result['version'] == 'different'
# Check the link is the subproject document link
document_link = subproject.get_docs_url(version_slug=subproject_version.slug)
link = first_result['domain'] + first_result['path']
assert document_link in link

def test_doc_search_unexisting_project(self, api_client):
project = 'notfound'
assert not Project.objects.filter(slug=project).exists()
Expand Down