readthedocs · stsewd · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020 · Jul 15, 2020
@@ -45,9 +45,9 @@ def __str__(self):
         return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}'
 
     @classmethod
-    def top_viewed_pages(cls, project, since=None):
+    def top_viewed_pages(cls, project_slug, version_slug=None, top=10, since=None):
         """
-        Returns top 10 pages according to view counts.
+        Returns top N pages according to view counts.
 
         Structure of returned data is compatible to make graphs.
         Sample returned data::
@@ -64,19 +64,26 @@ def top_viewed_pages(cls, project, since=None):
 
         queryset = (
             cls.objects
-            .filter(project=project, date__gte=since)
+            .filter(project__slug=project_slug, date__gte=since)
+        )
+
+        if version_slug:
+            queryset = queryset.filter(version__slug=version_slug)
+
+        queryset = (
+            queryset
             .values_list('path')
             .annotate(total_views=Sum('view_count'))
             .values_list('path', 'total_views')
-            .order_by('-total_views')[:10]
+            .order_by('-total_views')[:top]
         )
 
         pages = []
         view_counts = []
 
-        for data in queryset.iterator():
-            pages.append(data[0])
-            view_counts.append(data[1])
+        for page, views in queryset.iterator():
+            pages.append(page)
+            view_counts.append(views)
 
         final_data = {
             'pages': pages,

@@ -1555,6 +1555,7 @@ def add_features(sender, **kwargs):
     INDEX_FROM_HTML_FILES = 'index_from_html_files'
     DONT_CREATE_INDEX = 'dont_create_index'
     USE_NEW_PIP_RESOLVER = 'use_new_pip_resolver'
+    USE_PAGE_VIEWS_IN_SEARCH_RESULTS = 'use_page_views_in_search_results'
 
     FEATURES = (
         (USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
@@ -1698,6 +1699,10 @@ def add_features(sender, **kwargs):
             USE_NEW_PIP_RESOLVER,
             _('Use new pip resolver'),
         ),
+        (
+            USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
+            _('Weight the number of page views into search results'),
+        ),
     )
 
     projects = models.ManyToManyField(

@@ -1065,7 +1065,7 @@ def get_context_data(self, **kwargs):
         project = self.get_project()
 
         # Count of views for top pages over the month
-        top_pages = PageView.top_viewed_pages(project)
+        top_pages = PageView.top_viewed_pages(project.slug)
         top_viewed_pages = list(zip(
             top_pages['pages'],
             top_pages['view_counts']

@@ -368,6 +368,9 @@ def get_queryset(self):
             # We use a permission class to control authorization
             filter_by_user=False,
             use_advanced_query=not self._get_project().has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
+            use_page_views=self._get_project().has_feature(
+                Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
+            ),
         )
         return queryset
 

@@ -12,6 +12,7 @@
     SimpleQueryString,
 )
 
+from readthedocs.analytics.models import PageView
 from readthedocs.core.utils.extend import SettingsOverrideObject
 from readthedocs.search.documents import PageDocument, ProjectDocument
 
@@ -33,13 +34,24 @@ class RTDFacetedSearch(FacetedSearch):
         'post_tags': ['</span>'],
     }
 
-    def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs):
+    def __init__(
+            self,
+            query=None,
+            filters=None,
+            user=None,
+            use_advanced_query=True,
+            use_page_views=False,
+            **kwargs
+    ):
         """
         Pass in a user in order to filter search results by privacy.
 
         If `use_advanced_query` is `True`,
         force to always use `SimpleQueryString` for the text query object.
 
+        If `use_page_views` is `True`,
+        weight page views into the search results.
+
         .. warning::
 
             The `self.user` and `self.filter_by_user` attributes
@@ -48,6 +60,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True,
         self.user = user
         self.filter_by_user = kwargs.pop('filter_by_user', True)
         self.use_advanced_query = use_advanced_query
+        self.use_page_views = use_page_views
 
         # Hack a fix to our broken connection pooling
         # This creates a new connection on every request,
@@ -247,14 +260,92 @@ def query(self, search, query):
 
     def _get_script_score(self):
         """
-        Gets an ES script to map the page rank to a valid score weight.
+        Gets an ES script that combines the page rank and views into the final score.
+
+        **Page ranking weight calculation**
+
+        Each rank maps to an element in the ranking list.
+        -10 will map to the first element (-10 + 10 = 0) and so on.
+
+        **Page views weight calculation**
+
+        We calculate two values:
+
+        - absolute: this is equal to ``log10(views + 1)``
+          (we add one so a page with zero views gets a weight of 0, not -Infinity).
+          A logarithmic function is a good fit due to its growth rate.
+        - relative: this is equal to the ratio between the number of views of the current page
+          and the max number of views of the current version.
+
+        Those two values are added and multiplied by a weight (``views_factor``).
+
+        .. note::
+
+           We can also make use of the ratio between the number of views
+           and the average of views of the current version.
+
+        **Final score**
+
+        To generate the final score,
+        all weights are added and multiplied by the original score.
+
+        Docs about the script score query and the painless language at:
+
+        - https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
+        - https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html
+        """
+        source = """
+            // Page ranking weight.
+            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
+            double ranking = params.ranking[rank + 10];
+
+            // Page views weight.
+            int views = 0;
+            int max_views = 0;
+            String project = doc['project'].value;
+            String version = doc['version'].value;
+            String path = doc['full_path'].value;
+
+            Map pages = params.top_pages.get(project);
+            if (pages != null) {
+                pages = pages.get(version);
+                if (pages != null) {
+                    views = (int) pages.get("pages").getOrDefault(path, 0);
+                    max_views = (int) pages.get("max");
+                }
+            }
+            double absolute_views = Math.log10(views + 1);
+            double relative_views = 0;
+            if (max_views > 0) {
+                relative_views = (double) views / max_views;  // int / int would truncate to 0.
+            }
+            double views_weight = (absolute_views + relative_views) * params.views_factor;
+
+            // Combine all weights into a final score.
+            return (ranking + views_weight) * _score;
+        """
+        return {
+            "script": {
+                "source": source,
+                "params": {
+                    "ranking": self._get_ranking(),
+                    "top_pages": self._get_top_pages(),
+                    "views_factor": 1/10,
+                },
+            },
+        }
+
+    def _get_ranking(self):
+        """
+        Get ranking for pages.
 
         ES expects the rank to be a number greater than 0,
         but users can set this between [-10, +10].
         We map that range to [0.01, 2] (21 possible values).
 
-        The first lower rank (0.8) needs to bring the score from the highest boost (sections.title^2)
-        close to the lowest boost (title^1.5), that way exact results take priority:
+        The first lower rank (0.8) needs to bring the score from the highest boost
+        (sections.title^2) close to the lowest boost (title^1.5), that way exact
+        results can still take priority:
 
         - 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
         - 1.5 * 0.8 = 1.2 (score lower than 1.5)
@@ -266,8 +357,6 @@ def _get_script_score(self):
         - 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)
 
         The next lower and higher ranks need to decrease/increase both scores.
-
-        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor  # noqa
         """
         ranking = [
             0.01,
@@ -292,18 +381,62 @@ def _get_script_score(self):
             1.96,
             2,
         ]
-        # Each rank maps to a element in the ranking list.
-        # -10 will map to the first element (-10 + 10 = 0) and so on.
-        source = """
-            int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
-            return params.ranking[rank + 10] * _score;
+        return ranking
+
+    def _get_top_pages(self):
         """
-        return {
-            "script": {
-                "source": source,
-                "params": {"ranking": ranking},
-            },
-        }
+        Get the top 100 pages for the first project and version in the current filters.
+
+        Returns a dictionary with the following structure:
+
+            {
+                'project': {
+                    'version': {
+                        'max': max_views,
+                        'pages': {
+                            'page': views,
+                        },
+                    },
+                },
+            }
+
+        The number of views is capped at 2**31 - 9,
+        so the value doesn't overflow when it is cast to an integer
+        inside ES; this also gives us a max value to work on and some space for
+        additional operations.
+        """
+        try:
+            if not self.use_page_views:
+                return {}
+
+            project = self.filter_values['project'][0]
+            version = self.filter_values['version'][0]
+            top_pages_data = PageView.top_viewed_pages(
+                project_slug=project,
+                version_slug=version,
+                top=100,
+            )
+            if not top_pages_data['pages'] or not top_pages_data['view_counts']:
+                return {}
+
+            max_int = 2**31 - 9
+            top_pages = {
+                page: min(views, max_int)
+                for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts'])
+            }
+            top_pages = {
+                project: {version: {'pages': top_pages}}
+            }
+
+            # Calculate the max views from each version.
+            for project_data in top_pages.values():
+                for version_data in project_data.values():
+                    pages = version_data['pages']
+                    max_ = max(pages.values())
+                    version_data['max'] = max_
+            return top_pages
+        except (KeyError, IndexError):  # Presumably: no project/version filter was given.
+            return {}
 
     def generate_nested_query(self, query, path, fields, inner_hits):
         """Generate a nested query with passed parameters."""