Skip to content

Search: weight page views into search results #7297

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions readthedocs/analytics/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __str__(self):
return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}'

@classmethod
def top_viewed_pages(cls, project, since=None, limit=10):
def top_viewed_pages(cls, project_slug, version_slug=None, since=None, limit=10):
"""
Returns top pages according to view counts.

Expand All @@ -64,8 +64,14 @@ def top_viewed_pages(cls, project, since=None, limit=10):

queryset = (
cls.objects
.filter(project=project, date__gte=since)
.values_list('path')
.filter(project__slug=project_slug, date__gte=since)
)

if version_slug:
queryset = queryset.filter(version__slug=version_slug)

queryset = (
queryset
.annotate(total_views=Sum('view_count'))
.values_list('path', 'total_views')
.order_by('-total_views')[:limit]
Expand All @@ -74,9 +80,9 @@ def top_viewed_pages(cls, project, since=None, limit=10):
pages = []
view_counts = []

for data in queryset.iterator():
pages.append(data[0])
view_counts.append(data[1])
for page, views in queryset.iterator():
pages.append(page)
view_counts.append(views)

final_data = {
'pages': pages,
Expand Down
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1606,6 +1606,7 @@ def add_features(sender, **kwargs):
ENABLE_MKDOCS_SERVER_SIDE_SEARCH = 'enable_mkdocs_server_side_search'
DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
INDEX_FROM_HTML_FILES = 'index_from_html_files'
USE_PAGE_VIEWS_IN_SEARCH_RESULTS = 'use_page_views_in_search_results'

LIST_PACKAGES_INSTALLED_ENV = 'list_packages_installed_env'
VCS_REMOTE_LISTING = 'vcs_remote_listing'
Expand Down Expand Up @@ -1719,6 +1720,10 @@ def add_features(sender, **kwargs):
INDEX_FROM_HTML_FILES,
_('Index content directly from html files instead or relying in other sources'),
),
(
USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
_('Weight the number of page views into search results'),
),

(
LIST_PACKAGES_INSTALLED_ENV,
Expand Down
14 changes: 6 additions & 8 deletions readthedocs/projects/views/private.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,11 @@
from django.http import (
Http404,
HttpResponseBadRequest,
HttpResponseNotAllowed,
HttpResponseRedirect,
StreamingHttpResponse,
)
from django.middleware.csrf import get_token
from django.shortcuts import get_object_or_404, render
from django.shortcuts import get_object_or_404
from django.urls import reverse
from django.utils import timezone
from django.utils.safestring import mark_safe
Expand All @@ -40,10 +39,7 @@
Version,
VersionAutomationRule,
)
from readthedocs.core.mixins import (
ListViewWithForm,
PrivateViewMixin,
)
from readthedocs.core.mixins import ListViewWithForm, PrivateViewMixin
from readthedocs.core.utils import trigger_build
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.integrations.models import HttpExchange, Integration
Expand Down Expand Up @@ -72,7 +68,6 @@
Domain,
EmailHook,
EnvironmentVariable,
Feature,
Project,
ProjectRelationship,
WebHook,
Expand Down Expand Up @@ -1129,7 +1124,10 @@ def get_context_data(self, **kwargs):
return context

# Count of views for top pages over the month
top_pages = PageView.top_viewed_pages(project, limit=25)
top_pages = PageView.top_viewed_pages(
project_slug=project.slug,
limit=25,
)
top_viewed_pages = list(zip(
top_pages['pages'],
top_pages['view_counts']
Expand Down
3 changes: 3 additions & 0 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,9 @@ def get_queryset(self):
projects=projects,
aggregate_results=False,
use_advanced_query=not main_project.has_feature(Feature.DEFAULT_TO_FUZZY_SEARCH),
use_page_views=main_project.has_feature(
Feature.USE_PAGE_VIEWS_IN_SEARCH_RESULTS,
),
)
return queryset

Expand Down
158 changes: 141 additions & 17 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Wildcard,
)

from readthedocs.analytics.models import PageView
from readthedocs.search.documents import PageDocument, ProjectDocument

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -48,6 +49,7 @@ def __init__(
projects=None,
aggregate_results=True,
use_advanced_query=True,
use_page_views=False,
**kwargs,
):
"""
Expand All @@ -58,8 +60,9 @@ def __init__(
:param projects: A dictionary of project slugs mapped to a `VersionData` object.
Or a list of project slugs.
Results are filter with these values.
:param use_advanced_query: If `True` forces to always use
:param bool use_advanced_query: If `True` forces to always use
`SimpleQueryString` for the text query object.
:param bool use_page_views: If `True`, weight page views into the search results.
:param bool aggregate_results: If results should be aggregated,
this is returning the number of results within other facets.
:param bool use_advanced_query: Always use SimpleQueryString.
Expand All @@ -68,6 +71,7 @@ def __init__(
self.use_advanced_query = use_advanced_query
self.aggregate_results = aggregate_results
self.projects = projects or {}
self.use_page_views = use_page_views

# Hack a fix to our broken connection pooling
# This creates a new connection on every request,
Expand Down Expand Up @@ -380,14 +384,92 @@ def _get_nested_query(self, *, query, path, fields):

def _get_script_score(self):
"""
Gets an ES script to map the page rank to a valid score weight.
Gets an ES script that combines the page rank and views into the final score.

**Page ranking weight calculation**

Each rank maps to an element in the ranking list.
-10 will map to the first element (-10 + 10 = 0) and so on.

**Page views weight calculation**

We calculate two values:

- absolute: this is equal to ``log10(views + 1)``
(we add one because the logarithm is undefined at 0;
this way a page without views gets ``log10(1) = 0``).
A logarithmic function is a good fit due to its growth rate.
Comment on lines +398 to +400
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This absolute value is across all the versions for the same page?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, here views is the number of views from that page in that version, this value isn't compared to anything (that's why it's absolute).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, if we have a page /api/v2.html that has a lot of views in version 1.0, then a new version of the docs is published, 1.1; the new page /1.1/api/v2.html won't be shown first in the results because it will have fewer views?

If that is correct, won't we be pointing the user always to old pages because they will have more views than the up-to-date version of the exact same page?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar example: searching for the term API in the 1.1 version of the docs. It may refer to another page (i.e. configuration.html) instead of api/v2.html because the newer version of configuration.html has more views than the newer version of api/v2.html --but historically (old versions) api/v2.html has a lot more views. Makes sense?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I avoided merging versions since the content can be different, I think we should keep versions isolated. Another example is if a new api/v3.html is published, we don't want to list the results from the old api first.

- relative: this is equal to the ratio between the number of views of the current page
and the max number of views of the current version.

Those two values are added and multiplied by a weight (``views_factor``).

.. note::

We can also make use of the ratio between the number of views
and the average of views of the current version.

**Final score**

To generate the final score,
all weights are added and multiplied by the original score.

Docs about the script score query and the painless language at:

- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
- https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html
"""
source = """
// Page ranking weight.
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
double ranking = params.ranking[rank + 10];

// Page views weight.
int views = 0;
int max_views = 0;
String project = doc['project'].value;
String version = doc['version'].value;
String path = doc['full_path'].value;

Map pages = params.top_pages.get(project);
if (pages != null) {
pages = pages.get(version);
if (pages != null) {
views = (int) pages.get("pages").getOrDefault(path, 0);
max_views = (int) pages.get("max");
}
}
double absolute_views = Math.log10(views + 1);
double relative_views = 0;
if (max_views > 0) {
relative_views = views/max_views;
}
double views_weight = (absolute_views + relative_views) * params.views_factor;

// Combine all weights into a final score.
return (ranking + views_weight) * _score;
"""
return {
"script": {
"source": source,
"params": {
"ranking": self._get_ranking(),
"top_pages": self._get_top_pages(),
"views_factor": 1/10,
},
},
}

def _get_ranking(self):
"""
Get ranking for pages.

ES expects the rank to be a number greater than 0,
but users can set this between [-10, +10].
We map that range to [0.01, 2] (21 possible values).

The first lower rank (0.8) needs to bring the score from the highest boost (sections.title^2)
close to the lowest boost (title^1.5), that way exact results take priority:
The first lower rank (0.8) needs to bring the score from the highest boost
(sections.title^2) close to the lowest boost (title^1.5), that way exact
results can still take priority:

- 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it)
- 1.5 * 0.8 = 1.2 (score lower than 1.5)
Expand All @@ -399,8 +481,6 @@ def _get_script_score(self):
- 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)

The next lower and higher ranks need to decrease/increase both scores.

See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
"""
ranking = [
0.01,
Expand All @@ -425,15 +505,59 @@ def _get_script_score(self):
1.96,
2,
]
# Each rank maps to a element in the ranking list.
# -10 will map to the first element (-10 + 10 = 0) and so on.
source = """
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
return params.ranking[rank + 10] * _score;
return ranking

def _get_top_pages(self):
"""
return {
"script": {
"source": source,
"params": {"ranking": ranking},
},
}
Get the top 100 pages for the versions of the current projects.

Returns a dictionary with the following structure:

{
'project': {
'version': {
'max': max_views,
'pages': {
'page': views,
},
},
},
}

The number of views can be between 0 and 2**31 - 9,
this is so we don't overflow when casting the value to an integer
inside ES, this also gives us a max value to work on and some space for
additional operations.
"""
try:
if not self.use_page_views:
return {}

project = self.filter_values['project'][0]
version = self.filter_values['version'][0]
top_pages_data = PageView.top_viewed_pages(
project_slug=project,
version_slug=version,
top=100,
)
if not top_pages_data['pages'] or not top_pages_data['view_counts']:
return {}

max_int = 2**31 - 9
top_pages_for_version = {
page: min(views, max_int)
for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts'])
}
top_pages = {
project: {version: {'pages': top_pages_for_version}}
}

# Calculate the max views from each version.
for project_data in top_pages.values():
for version_data in project_data.values():
pages = version_data['pages']
max_ = max(pages.values())
version_data['max'] = max_
return top_pages
except (KeyError, IndexError):
return {}
Loading