12
12
SimpleQueryString ,
13
13
)
14
14
15
+ from readthedocs .analytics .models import PageView
15
16
from readthedocs .core .utils .extend import SettingsOverrideObject
16
17
from readthedocs .search .documents import PageDocument , ProjectDocument
17
18
@@ -247,7 +248,79 @@ def query(self, search, query):
247
248
248
249
def _get_script_score (self ):
249
250
"""
250
- Gets an ES script to map the page rank to a valid score weight.
251
+ Gets an ES script that combines the page rank and views into the final score.
252
+
253
+ **Page ranking weight calculation**
254
+
255
+ Each rank maps to a element in the ranking list.
256
+ -10 will map to the first element (-10 + 10 = 0) and so on.
257
+
258
+ **Page views weight calculation**
259
+
260
+ We calculate two values:
261
+
262
+ - absolute: this is equal to ``log10(views + 1)``
263
+ (we add one since logarithms start at 1).
264
+ A logarithmic function is a good fit due to its growth rate.
265
+ - relative: this is equal to ``views/max_views``,
266
+ where ``max_views`` is the max value from al page views from that version.
267
+
268
+ Those two values are added and multiplied by a weight (``views_factor``).
269
+
270
+ **Final score**
271
+
272
+ To generate the final score,
273
+ all weights are added and multiplied by the original score.
274
+
275
+ Docs about the script score query and the painless language at:
276
+
277
+ - https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
278
+ - https://www.elastic.co/guide/en/elasticsearch/painless/6.8/painless-api-reference.html
279
+ """
280
+ source = """
281
+ // Page ranking weight.
282
+ int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
283
+ double ranking = params.ranking[rank + 10];
284
+
285
+ // Page views weight.
286
+ int views = 0;
287
+ int max_views = 0;
288
+ String project = doc['project'].value;
289
+ String version = doc['version'].value;
290
+ String path = doc['full_path'].value;
291
+
292
+ Map pages = params.top_pages.get(project);
293
+ if (pages != null) {
294
+ pages = pages.get(version);
295
+ if (pages != null) {
296
+ views = (int) pages.get("pages").getOrDefault(path, 0);
297
+ max_views = (int) pages.get("max");
298
+ }
299
+ }
300
+ double absolute_views = Math.log10(views + 1);
301
+ double relative_views = 0;
302
+ if (max_views > 0) {
303
+ relative_views = views/max_views;
304
+ }
305
+ double views_weight = (absolute_views + relative_views) * params.views_factor;
306
+
307
+ // Combine all weights into a final score
308
+ return (ranking + views_weight) * _score;
309
+ """
310
+ return {
311
+ "script" : {
312
+ "source" : source ,
313
+ "params" : {
314
+ "ranking" : self ._get_ranking (),
315
+ "top_pages" : self ._get_top_pages (),
316
+ "views_factor" : 1 / 10 ,
317
+ },
318
+ },
319
+ }
320
+
321
+ def _get_ranking (self ):
322
+ """
323
+ Get ranking for pages.
251
324
252
325
ES expects the rank to be a number greater than 0,
253
326
but users can set this between [-10, +10].
@@ -266,8 +339,6 @@ def _get_script_score(self):
266
339
- 1.5 * 1.3 = 1.95 (score close to 2.0, but not higher than it)
267
340
268
341
The next lower and higher ranks need to decrease/increase both scores.
269
-
270
- See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html#field-value-factor # noqa
271
342
"""
272
343
ranking = [
273
344
0.01 ,
@@ -292,18 +363,55 @@ def _get_script_score(self):
292
363
1.96 ,
293
364
2 ,
294
365
]
295
- # Each rank maps to a element in the ranking list.
296
- # -10 will map to the first element (-10 + 10 = 0) and so on.
297
- source = """
298
- int rank = doc['rank'].size() == 0 ? 0 : (int) doc['rank'].value;
299
- return params.ranking[rank + 10] * _score;
366
+ return ranking
367
+
368
def _get_top_pages(self):
    """
    Get the top 100 pages of the project/version being searched.

    Only the first value of the ``project`` and ``version`` filters
    (``self.filter_values``) is used.

    Returns a dictionary with the following structure:

        {
            'project': {
                'version': {
                    'max': max_views,
                    'pages': {
                        'page': views,
                    },
                },
            },
        }

    ``max`` is the highest view count among the returned pages,
    or 0 when there are no page views for that version.

    The number of views can be between 0 and 2**31 - 9,
    this is so we don't overflow when casting the value to an integer
    inside ES, this also gives us a max value to work on and some space for
    additional operations.

    An empty dictionary is returned if the project or version filter
    is missing or empty.
    """
    try:
        project = self.filter_values['project'][0]
        version = self.filter_values['version'][0]
        top_pages_data = PageView.top_viewed_pages(
            project_slug=project,
            version_slug=version,
            top=100,
        )
        max_int = 2 ** 31 - 9
        # 'pages' and 'view_counts' are read in lockstep:
        # the n-th count belongs to the n-th page.
        pages = {
            page: min(views, max_int)
            for page, views in zip(top_pages_data['pages'], top_pages_data['view_counts'])
        }
    except (KeyError, IndexError):
        return {}

    return {
        project: {
            version: {
                'pages': pages,
                # ``default`` avoids a ValueError when the version has no
                # page views; the script skips the relative weight for max == 0.
                'max': max(pages.values(), default=0),
            },
        },
    }
307
415
308
416
def generate_nested_query (self , query , path , fields , inner_hits ):
309
417
"""Generate a nested query with passed parameters."""
0 commit comments