readthedocs · stsewd · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020
@@ -45,7 +45,7 @@ def __str__(self):
         return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}'
 
     @classmethod
-    def top_viewed_pages(cls, project, since=None, limit=10):
+    def top_viewed_pages(cls, project_slug, version_slug=None, since=None, limit=10):
         """
         Returns top pages according to view counts.
 
@@ -64,8 +64,14 @@ def top_viewed_pages(cls, project, since=None, limit=10):
 
         queryset = (
             cls.objects
-            .filter(project=project, date__gte=since)
-            .values_list('path')
+            .filter(project__slug=project_slug, date__gte=since)
+        )
+
+        if version_slug:
+            queryset = queryset.filter(version__slug=version_slug)
+        # values_list() before annotate() groups by path, so views are summed per page.
+        queryset = (
+            queryset.values_list('path')
             .annotate(total_views=Sum('view_count'))
             .values_list('path', 'total_views')
             .order_by('-total_views')[:limit]
@@ -74,9 +80,9 @@ def top_viewed_pages(cls, project, since=None, limit=10):
         pages = []
         view_counts = []
 
-        for data in queryset.iterator():
-            pages.append(data[0])
-            view_counts.append(data[1])
+        for page, views in queryset.iterator():
+            pages.append(page)
+            view_counts.append(views)
 
         final_data = {
             'pages': pages,

@@ -1606,6 +1606,7 @@ def add_features(sender, **kwargs):
     ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
     DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
     INDEX_FROM_HTML_FILES = 'index_from_html_files'
+    USE_PAGE_VIEWS_IN_SEARCH_RESULTS = 'use_page_views_in_search_results'
 
     LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
     VCS_REMOTE_LISTING = 'vcs_remote_listing'
@@ -1719,6 +1720,10 @@ def add_features(sender, **kwargs):
             INDEX_FROM_HTML_FILES,
             _('Index content directly from html files instead or relying in other sources'),
         ),
+        (
+            USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
+            _('Weight the number of page views into search results'),
+        ),
 
         (
             LIST_PACKAGES_INSTALLED_ENV,

@@ -10,12 +10,11 @@
 from django.http import (
     Http404,
     HttpResponseBadRequest,
-    HttpResponseNotAllowed,
     HttpResponseRedirect,
     StreamingHttpResponse,
 )
 from django.middleware.csrf import get_token
-from django.shortcuts import get_object_or_404, render
+from django.shortcuts import get_object_or_404
 from django.urls import reverse
 from django.utils import timezone
 from django.utils.safestring import mark_safe
@@ -40,10 +39,7 @@
     Version,
     VersionAutomationRule,
 )
-from readthedocs.core.mixins import (
-    ListViewWithForm,
-    PrivateViewMixin,
-)
+from readthedocs.core.mixins import ListViewWithForm, PrivateViewMixin
 from readthedocs.core.utils import trigger_build
 from readthedocs.core.utils.extend import SettingsOverrideObject
 from readthedocs.integrations.models import HttpExchange, Integration
@@ -72,7 +68,6 @@
     Domain,
     EmailHook,
     EnvironmentVariable,
-    Feature,
     Project,
     ProjectRelationship,
     WebHook,
@@ -1129,7 +1124,10 @@ def get_context_data(self, **kwargs):
             return context
 
         # Count of views for top pages over the month
-        top_pages = PageView.top_viewed_pages(project, limit=25)
+        top_pages = PageView.top_viewed_pages(
+            project_slug=project.slug,
+            limit=25,
+        )
         top_viewed_pages = list(zip(
             top_pages['pages'],
             top_pages['view_counts']

@@ -318,6 +318,9 @@ def get_queryset(self):
             projects=projects,
             aggregate_results=False,
             use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
+            use_page_views=main_project.has_feature(
+                Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
+            ),
         )
         return queryset
 

@@ -16,6 +16,7 @@
     Wildcard,
 )
 
+from readthedocs.analytics.models import PageView
 from readthedocs.search.documents import PageDocument, ProjectDocument
 
 log = logging.getLogger(__name__)
@@ -48,6 +49,7 @@ def __init__(
             projects=None,
             aggregate_results=True,
             use_advanced_query=True,
+            use_page_views=False,
             **kwargs,
     ):
         """
@@ -58,8 +60,9 @@ def __init__(
         :param projects: A dictionary of project slugs mapped to a `VersionData` object.
          Or a list of project slugs.
          Results are filter with these values.
-        :param use_advanced_query: If `True` forces to always use
+        :param bool use_advanced_query: If `True` forces to always use
          `SimpleQueryString` for the text query object.
+        :param bool use_page_views: If `True`, weight page views into the search results.
         :param bool aggregate_results: If results should be aggregated,
          this is returning the number of results within other facets.
         :param bool use_advanced_query: Always use SimpleQueryString.
@@ -68,6 +71,7 @@ def __init__(
         self.use_advanced_query = use_advanced_query
         self.aggregate_results = aggregate_results
         self.projects = projects or {}
+        self.use_page_views = use_page_views
 
         # Hack a fix to our broken connection pooling
         # This creates a new connection on every request,
@@ -380,14 +384,92 @@ def _get_nested_query(self, *, query, path, fields):
 
     def _get_script_score(self):
         """
-        Gets an ES script to map the page rank to a valid score weight.
+        Gets an ES script that combines the page rank and views into the final score.
+
+        **Page ranking weight calculation**
+
+        Each rank maps to an element in the ranking list.
+        -10 will map to the first element (-10 + 10 = 0) and so on.
+
+        **Page views weight calculation**
+
+        We calculate two values:
+
+        - absolute: this is equal to ``log10(views + 1)``
+          (we add one since the logarithm of zero is undefined).
+          A logarithmic function is a good fit due to its growth rate.
+        - relative: this is equal to the ratio between the number of views of the current page
+          and the max number of views of the current version.
+
+        Those two values are added and multiplied by a weight (``views_factor``).
+
+        .. note::
+
+           We can also make use of the ratio between the number of views
+           and the average of views of the current version.
+
+        **Final score**
+
+        To generate the final score,
+        all weights are added and multiplied by the original score.
+
+        Docs about the script score query and the painless language at:
+
+        - https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
+        - https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html
+        """
+        source = """
+            // Page ranking weight.
+            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
+            double ranking = params.ranking[rank + 10];
+
+            // Page views weight.
+            int views = 0;
+            int max_views = 0;
+            String project = doc['project'].value;
+            String version = doc['version'].value;
+            String path = doc['full_path'].value;
+
+            Map pages = params.top_pages.get(project);
+            if (pages != null) {
+                pages = pages.get(version);
+                if (pages != null) {
+                    views = (int) pages.get("pages").getOrDefault(path, 0);
+                    max_views = (int) pages.get("max");
+                }
+            }
+            double absolute_views = Math.log10(views + 1);
+            double relative_views = 0;
+            if (max_views > 0) {
+                relative_views = (double) views / max_views;  // int / int would truncate.
+            }
+            double views_weight = (absolute_views + relative_views) * params.views_factor;
+
+            // Combine all weights into a final score.
+            return (ranking + views_weight) * _score;
+        """
+        return {
+            "script": {
+                "source": source,
+                "params": {
+                    "ranking": self._get_ranking(),
+                    "top_pages": self._get_top_pages(),
+                    "views_factor": 1/10,
+                },
+            },
+        }
+
+    def _get_ranking(self):
+        """
+        Get ranking for pages.
 
         ES expects the rank to be a number greater than 0,
         but users can set this between [-10, +10].
         We map that range to [0.01, 2] (21 possible values).
 
-        The first lower rank (0.8) needs to bring the score from the highest boost (sections.title^2)
-        close to the lowest boost (title^1.5), that way exact results take priority:
+        The first lower rank (0.8) needs to bring the score from the highest boost
+        (sections.title^2) close to the lowest boost (title^1.5), that way exact
+        results can still take priority:
 
         - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
         - 1.5 * 0.8 = 1.2 (score lower than 1.5)
@@ -399,8 +481,6 @@ def _get_script_score(self):
         - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)
 
         The next lower and higher ranks need to decrease/increase both scores.
-
-        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
         """
         ranking = [
             0.01,
@@ -425,15 +505,59 @@ def _get_script_score(self):
             1.96,
             2,
         ]
-        # Each rank maps to a element in the ranking list.
-        # -10 will map to the first element (-10 + 10 = 0) and so on.
-        source = """
-            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
-            return params.ranking[rank + 10] * _score;
+        return ranking
+
+    def _get_top_pages(self):
         """
-        return {
-            "script": {
-                "source": source,
-                "params": {"ranking": ranking},
-            },
-        }
+        Get the top 100 pages for the first project and version of the current filters.
+
+        Returns a dictionary with the following structure:
+
+            {
+                'project': {
+                    'version': {
+                        'max': max_views,
+                        'pages': {
+                            'page': views,
+                        },
+                    },
+                },
+            }
+
+        The number of views can be between 0 and 2**31 - 9,
+        this is so we don't overflow when casting the value to an integer
+        inside ES, this also gives us a max value to work on and some space for
+        additional operations.
+        """
+        try:
+            if not self.use_page_views:
+                return {}
+
+            project = self.filter_values['project'][0]
+            version = self.filter_values['version'][0]
+            top_pages_data = PageView.top_viewed_pages(
+                project_slug=project,
+                version_slug=version,
+                limit=100,
+            )
+            if not top_pages_data['pages'] or not top_pages_data['view_counts']:
+                return {}
+
+            max_int = 2**31 - 9
+            top_pages_for_version = {
+                page: min(views, max_int)
+                for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts'])
+            }
+            top_pages = {
+                project: {version: {'pages': top_pages_for_version}}
+            }
+
+            # Calculate the max views from each version.
+            for project_data in top_pages.values():
+                for version_data in project_data.values():
+                    pages = version_data['pages']
+                    max_ = max(pages.values())
+                    version_data['max'] = max_
+            return top_pages
+        except (KeyError, IndexError):
+            return {}