diff --git a/readthedocs/analytics/models.py b/readthedocs/analytics/models.py index b0fa01f4cec..8be550d8a06 100644 --- a/readthedocs/analytics/models.py +++ b/readthedocs/analytics/models.py @@ -45,7 +45,7 @@ def __str__(self): return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}' @classmethod - def top_viewed_pages(cls, project, since=None, limit=10): + def top_viewed_pages(cls, project_slug, version_slug=None, since=None, limit=10): """ Returns top pages according to view counts. @@ -64,8 +64,14 @@ def top_viewed_pages(cls, project, since=None, limit=10): queryset = ( cls.objects - .filter(project=project, date__gte=since) - .values_list('path') + .filter(project__slug=project_slug, date__gte=since) + ) + + if version_slug: + queryset = queryset.filter(version__slug=version_slug) + + queryset = ( + queryset.values_list('path') .annotate(total_views=Sum('view_count')) .values_list('path', 'total_views') .order_by('-total_views')[:limit] @@ -74,9 +80,9 @@ def top_viewed_pages(cls, project, since=None, limit=10): pages = [] view_counts = [] - for data in queryset.iterator(): - pages.append(data[0]) - view_counts.append(data[1]) + for page, views in queryset.iterator(): + pages.append(page) + view_counts.append(views) final_data = { 'pages': pages, diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index c5d7f6000ec..ca337090c76 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1606,6 +1606,7 @@ def add_features(sender, **kwargs): ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search' DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search' INDEX_FROM_HTML_FILES = 'index_from_html_files' + USE_PAGE_VIEWS_IN_SEARCH_RESULTS = 'use_page_views_in_search_results' LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env' VCS_REMOTE_LISTING = 'vcs_remote_listing' @@ -1719,6 +1720,10 @@ def add_features(sender, **kwargs): INDEX_FROM_HTML_FILES, _('Index content directly from html 
files instead or relying in other sources'), ), + ( + USE_PAGE_VIEWS_IN_SEARCH_RESULTS, + _('Weight the number of page views into search results'), + ), ( LIST_PACKAGES_INSTALLED_ENV, diff --git a/readthedocs/projects/views/private.py b/readthedocs/projects/views/private.py index 264bc57d3cf..75fe007f609 100644 --- a/readthedocs/projects/views/private.py +++ b/readthedocs/projects/views/private.py @@ -10,12 +10,11 @@ from django.http import ( Http404, HttpResponseBadRequest, - HttpResponseNotAllowed, HttpResponseRedirect, StreamingHttpResponse, ) from django.middleware.csrf import get_token -from django.shortcuts import get_object_or_404, render +from django.shortcuts import get_object_or_404 from django.urls import reverse from django.utils import timezone from django.utils.safestring import mark_safe @@ -40,10 +39,7 @@ Version, VersionAutomationRule, ) -from readthedocs.core.mixins import ( - ListViewWithForm, - PrivateViewMixin, -) +from readthedocs.core.mixins import ListViewWithForm, PrivateViewMixin from readthedocs.core.utils import trigger_build from readthedocs.core.utils.extend import SettingsOverrideObject from readthedocs.integrations.models import HttpExchange, Integration @@ -72,7 +68,6 @@ Domain, EmailHook, EnvironmentVariable, - Feature, Project, ProjectRelationship, WebHook, @@ -1129,7 +1124,10 @@ def get_context_data(self, **kwargs): return context # Count of views for top pages over the month - top_pages = PageView.top_viewed_pages(project, limit=25) + top_pages = PageView.top_viewed_pages( + project_slug=project.slug, + limit=25, + ) top_viewed_pages = list(zip( top_pages['pages'], top_pages['view_counts'] diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py index 96c2c3dbdec..6c14730433f 100644 --- a/readthedocs/search/api.py +++ b/readthedocs/search/api.py @@ -318,6 +318,9 @@ def get_queryset(self): projects=projects, aggregate_results=False, use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH), + 
+ use_page_views=main_project.has_feature( + Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS, + ), ) return queryset diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py index fa24d5abd0a..7bb68e971fc 100644 --- a/readthedocs/search/faceted_search.py +++ b/readthedocs/search/faceted_search.py @@ -16,6 +16,7 @@ Wildcard, ) +from readthedocs.analytics.models import PageView from readthedocs.search.documents import PageDocument, ProjectDocument log = logging.getLogger(__name__) @@ -48,6 +49,7 @@ def __init__( projects=None, aggregate_results=True, use_advanced_query=True, + use_page_views=False, **kwargs, ): """ @@ -58,8 +60,9 @@ def __init__( :param projects: A dictionary of project slugs mapped to a `VersionData` object. Or a list of project slugs. Results are filter with these values. - :param use_advanced_query: If `True` forces to always use + :param bool use_advanced_query: If `True` forces to always use `SimpleQueryString` for the text query object. + :param bool use_page_views: If `True`, weight page views into the search results. :param bool aggregate_results: If results should be aggregated, this is returning the number of results within other facets. :param bool use_advanced_query: Always use SimpleQueryString. @@ -68,6 +71,7 @@ def __init__( self.use_advanced_query = use_advanced_query self.aggregate_results = aggregate_results self.projects = projects or {} + self.use_page_views = use_page_views # Hack a fix to our broken connection pooling # This creates a new connection on every request, @@ -380,14 +384,92 @@ def _get_nested_query(self, *, query, path, fields): def _get_script_score(self): """ - Gets an ES script to map the page rank to a valid score weight. + Gets an ES script that combines the page rank and views into the final score. + + **Page ranking weight calculation** + + Each rank maps to an element in the ranking list. + -10 will map to the first element (-10 + 10 = 0) and so on. 
+ + **Page views weight calculation** + + We calculate two values: + + - absolute: this is equal to ``log10(views + 1)`` + (we add one so a page with zero views gets a weight of ``log10(1) = 0``). + A logarithmic function is a good fit due to its growth rate. + - relative: this is equal to the ratio between the number of views of the current page + and the max number of views of the current version. + + Those two values are added and multiplied by a weight (``views_factor``). + + .. note:: + + We can also make use of the ratio between the number of views + and the average of views of the current version. + + **Final score** + + To generate the final score, + all weights are added and multiplied by the original score. + + Docs about the script score query and the painless language at: + + - https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa + - https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html + """ + source = """ + // Page ranking weight. + int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; + double ranking = params.ranking[rank + 10]; + + // Page views weight. + int views = 0; + int max_views = 0; + String project = doc['project'].value; + String version = doc['version'].value; + String path = doc['full_path'].value; + + Map pages = params.top_pages.get(project); + if (pages != null) { + pages = pages.get(version); + if (pages != null) { + views = (int) pages.get("pages").getOrDefault(path, 0); + max_views = (int) pages.get("max"); + } + } + double absolute_views = Math.log10(views + 1); + double relative_views = 0; + if (max_views > 0) { + relative_views = (double) views / max_views; + } + double views_weight = (absolute_views + relative_views) * params.views_factor; + + // Combine all weights into a final score. 
+ return (ranking + views_weight) * _score; + """ + return { + "script": { + "source": source, + "params": { + "ranking": self._get_ranking(), + "top_pages": self._get_top_pages(), + "views_factor": 1/10, + }, + }, + } + + def _get_ranking(self): + """ + Get ranking for pages. ES expects the rank to be a number greater than 0, but users can set this between [-10, +10]. We map that range to [0.01, 2] (21 possible values). - The first lower rank (0.8) needs to bring the score from the highest boost (sections.title^2) - close to the lowest boost (title^1.5), that way exact results take priority: + The first lower rank (0.8) needs to bring the score from the highest boost + (sections.title^2) close to the lowest boost (title^1.5), that way exact + results can still take priority: - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it) - 1.5 * 0.8 = 1.2 (score lower than 1.5) @@ -399,8 +481,6 @@ def _get_script_score(self): - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it) The next lower and higher ranks need to decrease/increase both scores. - - See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa """ ranking = [ 0.01, @@ -425,15 +505,59 @@ def _get_script_score(self): 1.96, 2, ] - # Each rank maps to a element in the ranking list. - # -10 will map to the first element (-10 + 10 = 0) and so on. - source = """ - int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; - return params.ranking[rank + 10] * _score; + return ranking + + def _get_top_pages(self): """ - return { - "script": { - "source": source, - "params": {"ranking": ranking}, - }, - } + Get the top 100 pages for the versions of the current projects. 
+ + Returns a dictionary with the following structure: + + { + 'project': { + 'version': { + 'max': max_views, + 'pages': { + 'page': views, + }, + }, + }, + } + + The number of views can be between 0 and 2**31 - 9, + this is so we don't overflow when casting the value to an integer + inside ES, this also gives us a max value to work on and some space for + additional operations. + """ + try: + if not self.use_page_views: + return {} + + project = self.filter_values['project'][0] + version = self.filter_values['version'][0] + top_pages_data = PageView.top_viewed_pages( + project_slug=project, + version_slug=version, + limit=100, + ) + if not top_pages_data['pages'] or not top_pages_data['view_counts']: + return {} + + max_int = 2**31 - 9 + top_pages_for_version = { + page: min(views, max_int) + for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts']) + } + top_pages = { + project: {version: {'pages': top_pages_for_version}} + } + + # Calculate the max views from each version. 
+ for project_data in top_pages.values(): + for version_data in project_data.values(): + pages = version_data['pages'] + max_ = max(pages.values()) + version_data['max'] = max_ + return top_pages + except (KeyError, IndexError): + return {} diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py index 641a1217906..61954d566fd 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -5,6 +5,7 @@ from django.urls import reverse from django_dynamic_fixture import get +from readthedocs.analytics.models import PageView from readthedocs.builds.models import Version from readthedocs.projects.constants import ( MKDOCS, @@ -749,6 +750,225 @@ def test_search_custom_ranking(self, api_client): assert results[0]['path'] == '/en/latest/index.html' assert results[1]['path'] == '/en/latest/guides/index.html' + def test_search_page_views(self, api_client): + project = Project.objects.get(slug='docs') + version = project.versions.all().first() + + feature, _ = Feature.objects.get_or_create( + feature_id=Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS, + ) + project.feature_set.add(feature) + project.save() + + index_page = get( + PageView, + project=project, + version=version, + path='index.html', + view_count=0, + ) + guide_page = get( + PageView, + project=project, + version=version, + path='guides/index.html', + view_count=0, + ) + + search_params = { + 'project': project.slug, + 'version': version.slug, + 'q': '"content from"', + } + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + # Normal ordering + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' + + # Query with same number of page views. + # The ordering isn't affected. 
+ index_page.view_count = 200 + index_page.save() + guide_page.view_count = 200 + guide_page.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' + + # Query with guides/index.html having many more views than index.html. + index_page.view_count = 200 + index_page.save() + guide_page.view_count = 8000 + guide_page.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/guides/index.html' + assert results[1]['path'] == '/en/latest/index.html' + + # Query with guides/index.html having only twice the views of index.html. + index_page.view_count = 200 + index_page.save() + guide_page.view_count = 400 + guide_page.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/guides/index.html' + assert results[1]['path'] == '/en/latest/index.html' + + # Query with index.html having more views than guides/index.html. 
+ index_page.view_count = 6000 + index_page.save() + guide_page.view_count = 1200 + guide_page.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' + + def test_search_ranking_and_page_views(self, api_client): + project = Project.objects.get(slug='docs') + version = project.versions.all().first() + + feature, _ = Feature.objects.get_or_create( + feature_id=Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS, + ) + project.feature_set.add(feature) + project.save() + + page_index = HTMLFile.objects.get(path='index.html') + page_guides = HTMLFile.objects.get(path='guides/index.html') + + page_view_index = get( + PageView, + project=project, + version=version, + path='index.html', + view_count=0, + ) + page_view_guides = get( + PageView, + project=project, + version=version, + path='guides/index.html', + view_count=0, + ) + + search_params = { + 'project': project.slug, + 'version': version.slug, + 'q': '"content from"', + } + + # Query with the default ranking and 0 page views. + assert page_index.rank == 0 + assert page_guides.rank == 0 + assert page_view_index.view_count == 0 + assert page_view_guides.view_count == 0 + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' + + # Query with a higher rank over guides/index.html, + # and more page views on index.html. + # Ranking has more priority than page views. 
+ page_guides.rank = 5 + page_guides.save() + PageDocument().update(page_guides) + + page_view_index.view_count = 800 + page_view_index.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/guides/index.html' + assert results[1]['path'] == '/en/latest/index.html' + + # Query with a lower rank over index.html, + # and more page views on guides/index.html. + # Page views have more priority. + page_index.rank = -1 + page_index.save() + page_guides.rank = 0 + page_guides.save() + PageDocument().update(page_index) + PageDocument().update(page_guides) + + page_view_index.view_count = 200 + page_view_index.save() + page_view_guides.view_count = 800 + page_view_guides.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/guides/index.html' + assert results[1]['path'] == '/en/latest/index.html' + + # Query with a rank of 1 over index.html, + # and more page views on guides/index.html. + # A relatively higher number of page views has priority over a rank of 1. + page_index.rank = 1 + page_index.save() + page_guides.rank = 0 + page_guides.save() + PageDocument().update(page_index) + PageDocument().update(page_guides) + + page_view_index.view_count = 200 + page_view_index.save() + page_view_guides.view_count = 60000 + page_view_guides.save() + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/guides/index.html' + assert results[1]['path'] == '/en/latest/index.html' + + # Query with a rank of 2 over index.html, + # and more page views on guides/index.html. + # A relatively higher number of page views has less priority than a rank of 2. 
+ page_index.rank = 2 + page_index.save() + PageDocument().update(page_index) + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' + def test_search_ignore(self, api_client): project = Project.objects.get(slug='docs') version = project.versions.all().first()