-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Search: weight page views into search results #7297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
f9ce9ca
c27e4df
d267903
70ad4b8
65fcc7c
2e5aa58
552becc
7f77f3f
f21def6
67e95db
5981d08
cb57d01
d781976
944cf1b
cbdc129
3249b27
53ca222
054251c
558380c
4f53189
9790d41
93075d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
SimpleQueryString, | ||
) | ||
|
||
from readthedocs.analytics.models import PageView | ||
from readthedocs.core.utils.extend import SettingsOverrideObject | ||
from readthedocs.search.documents import PageDocument, ProjectDocument | ||
|
||
|
@@ -33,13 +34,24 @@ class RTDFacetedSearch(FacetedSearch): | |
'post_tags': ['</span>'], | ||
} | ||
|
||
def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, **kwargs): | ||
def __init__( | ||
self, | ||
query=None, | ||
filters=None, | ||
user=None, | ||
use_advanced_query=True, | ||
use_page_views=False, | ||
**kwargs | ||
): | ||
""" | ||
Pass in a user in order to filter search results by privacy. | ||
|
||
If `use_advanced_query` is `True`, | ||
force to always use `SimpleQueryString` for the text query object. | ||
|
||
If `use_page_views` is `True`, | ||
weight page views into the search results. | ||
|
||
.. warning:: | ||
|
||
The `self.user` and `self.filter_by_user` attributes | ||
|
@@ -48,6 +60,7 @@ def __init__(self, query=None, filters=None, user=None, use_advanced_query=True, | |
self.user = user | ||
self.filter_by_user = kwargs.pop('filter_by_user', True) | ||
self.use_advanced_query = use_advanced_query | ||
self.use_page_views = use_page_views | ||
|
||
# Hack a fix to our broken connection pooling | ||
# This creates a new connection on every request, | ||
|
@@ -247,14 +260,92 @@ def query(self, search, query): | |
|
||
def _get_script_score(self): | ||
""" | ||
Gets an ES script to map the page rank to a valid score weight. | ||
Gets an ES script that combines the page rank and views into the final score. | ||
|
||
**Page ranking weight calculation** | ||
|
||
Each rank maps to a element in the ranking list. | ||
stsewd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
-10 will map to the first element (-10 + 10 = 0) and so on. | ||
|
||
**Page views weight calculation** | ||
|
||
We calculate two values: | ||
|
||
- absolute: this is equal to ``log10(views + 1)`` | ||
(we add one since logarithms start at 1). | ||
A logarithmic function is a good fit due to its growth rate. | ||
Comment on lines
+398
to
+400
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This absolute value is across all the versions for the same page? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope, here views is the number of views from that page in that version, this value isn't compared to anything (that's why it's absolute). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, if we have a page If that is correct, won't we be pointing the user always to old pages because they will have more views than the up-to-date version of the exact same page? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar example searching for the term There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, I avoided merging versions since the content can be different, I think we should keep versions isolated. Another example is if a new api/v3.html is published, we don't want to list the results from the old api first. |
||
- relative: this is equal to the ratio between the number of views of the current page | ||
and the max number of views of the current version. | ||
|
||
Those two values are added and multiplied by a weight (``views_factor``). | ||
|
||
.. note:: | ||
|
||
We can also make use of the ratio between the number of views | ||
and the average of views of the current version. | ||
|
||
**Final score** | ||
|
||
To generate the final score, | ||
all weights are added and multiplied by the original score. | ||
|
||
Docs about the script score query and the painless language at: | ||
|
||
- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa | ||
- https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html | ||
""" | ||
source = """ | ||
// Page ranking weight. | ||
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; | ||
double ranking = params.ranking[rank + 10]; | ||
|
||
// Page views weight. | ||
int views = 0; | ||
int max_views = 0; | ||
String project = doc['project'].value; | ||
String version = doc['version'].value; | ||
String path = doc['full_path'].value; | ||
|
||
Map pages = params.top_pages.get(project); | ||
if (pages != null) { | ||
pages = pages.get(version); | ||
if (pages != null) { | ||
views = (int) pages.get("pages").getOrDefault(path, 0); | ||
max_views = (int) pages.get("max"); | ||
} | ||
} | ||
double absolute_views = Math.log10(views + 1); | ||
double relative_views = 0; | ||
if (max_views > 0) { | ||
relative_views = views/max_views; | ||
} | ||
double views_weight = (absolute_views + relative_views) * params.views_factor; | ||
|
||
// Combine all weights into a final score. | ||
return (ranking + views_weight) * _score; | ||
""" | ||
return { | ||
"script": { | ||
"source": source, | ||
"params": { | ||
"ranking": self._get_ranking(), | ||
"top_pages": self._get_top_pages(), | ||
"views_factor": 1/10, | ||
}, | ||
}, | ||
} | ||
|
||
def _get_ranking(self): | ||
""" | ||
Get ranking for pages. | ||
|
||
ES expects the rank to be a number greater than 0, | ||
but users can set this between [-10, +10]. | ||
We map that range to [0.01, 2] (21 possible values). | ||
|
||
The first lower rank (0.8) needs to bring the score from the highest boost (sections.title^2) | ||
close to the lowest boost (title^1.5), that way exact results take priority: | ||
The first lower rank (0.8) needs to bring the score from the highest boost | ||
(sections.title^2) close to the lowest boost (title^1.5), that way exact | ||
results can still take priority: | ||
|
||
- 2.0 * 0.8 = 1.6 (score close to 1.5, but not lower than it) | ||
- 1.5 * 0.8 = 1.2 (score lower than 1.5) | ||
|
@@ -266,8 +357,6 @@ def _get_script_score(self): | |
- 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it) | ||
|
||
The next lower and higher ranks need to decrease/increase both scores. | ||
|
||
See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa | ||
""" | ||
ranking = [ | ||
0.01, | ||
|
@@ -292,18 +381,62 @@ def _get_script_score(self): | |
1.96, | ||
2, | ||
] | ||
# Each rank maps to a element in the ranking list. | ||
# -10 will map to the first element (-10 + 10 = 0) and so on. | ||
source = """ | ||
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value; | ||
return params.ranking[rank + 10] * _score; | ||
return ranking | ||
|
||
def _get_top_pages(self): | ||
""" | ||
return { | ||
"script": { | ||
"source": source, | ||
"params": {"ranking": ranking}, | ||
}, | ||
} | ||
Get the top 100 pages for the versions of the current projects. | ||
|
||
Returns a dictionary with the following structure: | ||
|
||
{ | ||
'project': { | ||
'version': { | ||
'max': max_views, | ||
'pages': { | ||
'page': views, | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
The number of views can be between 0 and 2**31 - 9, | ||
this is so we don't overflow when casting the value to an integer | ||
inside ES, this also gives us a max value to work on and some space for | ||
additional operations. | ||
""" | ||
try: | ||
if not self.use_page_views: | ||
return {} | ||
|
||
project = self.filter_values['project'][0] | ||
version = self.filter_values['version'][0] | ||
top_pages_data = PageView.top_viewed_pages( | ||
project_slug=project, | ||
version_slug=version, | ||
top=100, | ||
) | ||
if not top_pages_data['pages'] or not top_pages_data['view_counts']: | ||
return {} | ||
|
||
max_int = 2**31 - 9 | ||
top_pages = { | ||
stsewd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
page: min(views, max_int) | ||
for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts']) | ||
} | ||
top_pages = { | ||
project: {version: {'pages': top_pages}} | ||
} | ||
|
||
# Calculate the max views from each version. | ||
for project_data in top_pages.values(): | ||
for version_data in project_data.values(): | ||
pages = version_data['pages'] | ||
max_ = max(pages.values()) | ||
version_data['max'] = max_ | ||
return top_pages | ||
except (KeyError, IndexError): | ||
return {} | ||
|
||
def generate_nested_query(self, query, path, fields, inner_hits): | ||
"""Generate a nested query with passed parameters.""" | ||
|
Uh oh!
There was an error while loading. Please reload this page.