Skip to content

Commit 7f77f3f

Browse files
committed
Search: weight page views into search results
Closes #5968
1 parent 552becc commit 7f77f3f

File tree

2 files changed

+136
-21
lines changed

2 files changed

+136
-21
lines changed

readthedocs/analytics/models.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ def __str__(self):
4545
return f'PageView: [{self.project.slug}:{self.version.slug}] - {self.path} for {self.date}'
4646

4747
@classmethod
48-
def top_viewed_pages(cls, project, since=None):
48+
def top_viewed_pages(cls, project_slug, version_slug=None, top=10, since=None):
4949
"""
50-
Returns top 10 pages according to view counts.
50+
Returns top N pages according to view counts.
5151
5252
Structure of returned data is compatible to make graphs.
5353
Sample returned data::
@@ -64,19 +64,26 @@ def top_viewed_pages(cls, project, since=None):
6464

6565
queryset = (
6666
cls.objects
67-
.filter(project=project, date__gte=since)
67+
.filter(project__slug=project_slug, date__gte=since)
68+
)
69+
70+
if version_slug:
71+
queryset = queryset.filter(version__slug=version_slug)
72+
73+
queryset = (
74+
queryset
6875
.values_list('path')
6976
.annotate(total_views=Sum('view_count'))
7077
.values_list('path', 'total_views')
71-
.order_by('-total_views')[:10]
78+
.order_by('-total_views')[:top]
7279
)
7380

7481
pages = []
7582
view_counts = []
7683

77-
for data in queryset.iterator():
78-
pages.append(data[0])
79-
view_counts.append(data[1])
84+
for page, views in queryset.iterator():
85+
pages.append(page)
86+
view_counts.append(views)
8087

8188
final_data = {
8289
'pages': pages,

readthedocs/search/faceted_search.py

+122-14
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
SimpleQueryString,
1313
)
1414

15+
from readthedocs.analytics.models import PageView
1516
from readthedocs.core.utils.extend import SettingsOverrideObject
1617
from readthedocs.search.documents import PageDocument, ProjectDocument
1718

@@ -247,7 +248,79 @@ def query(self, search, query):
247248

248249
def _get_script_score(self):
249250
"""
250-
Gets an ES script to map the page rank to a valid score weight.
251+
Gets an ES script that combines the page rank and views into the final score.
252+
253+
**Page ranking weight calculation**
254+
255+
Each rank maps to a element in the ranking list.
256+
-10 will map to the first element (-10 + 10 = 0) and so on.
257+
258+
**Page views weight calculation**
259+
260+
We calculate two values:
261+
262+
- absolute: this is equal to ``log10(views + 1)``
263+
(we add one since logarithms start at 1).
264+
A logarithmic function is a good fit due to its growth rate.
265+
- relative: this is equal to ``views/max_views``,
266+
where ``max_views`` is the max value from al page views from that version.
267+
268+
Those two values are added and multiplied by a weight (``views_factor``).
269+
270+
**Final score**
271+
272+
To generate the final score,
273+
all weights are added and multiplied by the original score.
274+
275+
Docs about the script score query and the painless language at:
276+
277+
- https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
278+
- https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html
279+
"""
280+
source = """
281+
// Page ranking weight.
282+
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
283+
double ranking = params.ranking[rank + 10];
284+
285+
// Page views weight.
286+
int views = 0;
287+
int max_views = 0;
288+
String project = doc['project'].value;
289+
String version = doc['version'].value;
290+
String path = doc['full_path'].value;
291+
292+
Map pages = params.top_pages.get(project);
293+
if (pages != null) {
294+
pages = pages.get(version);
295+
if (pages != null) {
296+
views = (int) pages.get("pages").getOrDefault(path, 0);
297+
max_views = (int) pages.get("max");
298+
}
299+
}
300+
double absolute_views = Math.log10(views + 1);
301+
double relative_views = 0;
302+
if (max_views > 0) {
303+
relative_views = views/max_views;
304+
}
305+
double views_weight = (absolute_views + relative_views) * params.views_factor;
306+
307+
// Combine all weights into a final score
308+
return (ranking + views_weight) * _score;
309+
"""
310+
return {
311+
"script": {
312+
"source": source,
313+
"params": {
314+
"ranking": self._get_ranking(),
315+
"top_pages": self._get_top_pages(),
316+
"views_factor": 1/10,
317+
},
318+
},
319+
}
320+
321+
def _get_ranking(self):
322+
"""
323+
Get ranking for pages.
251324
252325
ES expects the rank to be a number greater than 0,
253326
but users can set this between [-10, +10].
@@ -266,8 +339,6 @@ def _get_script_score(self):
266339
- 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)
267340
268341
The next lower and higher ranks need to decrease/increase both scores.
269-
270-
See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
271342
"""
272343
ranking = [
273344
0.01,
@@ -292,18 +363,55 @@ def _get_script_score(self):
292363
1.96,
293364
2,
294365
]
295-
# Each rank maps to a element in the ranking list.
296-
# -10 will map to the first element (-10 + 10 = 0) and so on.
297-
source = """
298-
int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
299-
return params.ranking[rank + 10] * _score;
366+
return ranking
367+
368+
def _get_top_pages(self):
300369
"""
301-
return {
302-
"script": {
303-
"source": source,
304-
"params": {"ranking": ranking},
305-
},
306-
}
370+
Get the top 100 pages for the versions of the current projects.
371+
372+
Returns a dictionary with the following structure:
373+
374+
{
375+
'project': {
376+
'version': {
377+
'max': max_views,
378+
'pages': {
379+
'page': views,
380+
},
381+
},
382+
},
383+
}
384+
385+
The number of views can be between 0 and 2**31 - 9,
386+
this is so we don't overflow when casting the value to an integer
387+
inside ES, this also gives us a max value to work on and some space for
388+
additional operations.
389+
"""
390+
try:
391+
project = self.filter_values['project'][0]
392+
version = self.filter_values['version'][0]
393+
top_pages_data = PageView.top_viewed_pages(
394+
project_slug=project,
395+
version_slug=version,
396+
top=100,
397+
)
398+
max_int = 2**31 - 9
399+
top_pages = {
400+
page: min(views, max_int)
401+
for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts'])
402+
}
403+
top_pages = {
404+
project: {version: {'pages': top_pages}}
405+
}
406+
407+
# Calculate the max views from each version.
408+
for project_data in top_pages.values():
409+
for version_data in project_data.values():
410+
max_ = max(version_data['pages'].values())
411+
version_data['max'] = max_
412+
return top_pages
413+
except (KeyError, IndexError):
414+
return {}
307415

308416
def generate_nested_query(self, query, path, fields, inner_hits):
309417
"""Generate a nested query with passed parameters."""

0 commit comments

Comments
 (0)