Skip to content

Commit 96fc90b

Browse files
authored
Analytics: don't record page views for PR previews (#11065)
* Analytics: don't record page views for PR previews * Tests * Fix log * Comments
1 parent 7fcd568 commit 96fc90b

File tree

6 files changed

+109
-31
lines changed

6 files changed

+109
-31
lines changed

readthedocs/analytics/models.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -25,25 +25,25 @@ class PageViewManager(models.Manager):
2525

2626
"""Manager for PageView model."""
2727

28-
def register_page_view(self, project, version, path, full_path, status):
28+
def register_page_view(self, project, version, filename, path, status):
2929
"""Track page view with the given parameters."""
3030
# TODO: remove after the migration of duplicate records has been completed.
3131
if project.has_feature(Feature.DISABLE_PAGEVIEWS):
3232
return
3333

3434
# Normalize paths to avoid duplicates.
35+
filename = "/" + filename.lstrip("/")
3536
path = "/" + path.lstrip("/")
36-
full_path = "/" + full_path.lstrip("/")
3737

3838
page_view, created = self.get_or_create(
3939
project=project,
4040
version=version,
41-
path=path,
41+
path=filename,
4242
date=timezone.now().date(),
4343
status=status,
4444
defaults={
4545
"view_count": 1,
46-
"full_path": full_path,
46+
"full_path": path,
4747
},
4848
)
4949
if not created:

readthedocs/analytics/proxied_api.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from functools import lru_cache
33
from urllib.parse import urlparse
44

5+
import structlog
56
from django.http import JsonResponse
67
from django.shortcuts import get_object_or_404
78
from rest_framework import status
@@ -13,8 +14,10 @@
1314
from readthedocs.core.mixins import CDNCacheControlMixin
1415
from readthedocs.core.unresolver import UnresolverError, unresolve
1516
from readthedocs.core.utils.extend import SettingsOverrideObject
17+
from readthedocs.core.utils.requests import is_suspicious_request
1618
from readthedocs.projects.models import Project
1719

20+
log = structlog.get_logger(__name__) # noqa
1821

1922
class BaseAnalyticsView(CDNCacheControlMixin, APIView):
2023

@@ -51,6 +54,7 @@ def _get_version(self):
5154
return version
5255

5356
def get(self, request, *args, **kwargs):
57+
# TODO: Use absolute_uri only, we don't need project and version.
5458
project = self._get_project()
5559
version = self._get_version()
5660
absolute_uri = self.request.GET.get("absolute_uri")
@@ -69,24 +73,34 @@ def get(self, request, *args, **kwargs):
6973

7074
def increase_page_view_count(self, project, version, absolute_uri):
7175
"""Increase the page view count for the given project."""
76+
if is_suspicious_request(self.request):
77+
log.info(
78+
"Suspicious request, not recording pageview.",
79+
url=absolute_uri,
80+
)
81+
return
82+
83+
# Don't allow tracking page views from external domains.
84+
if self.request.unresolved_domain.is_from_external_domain:
85+
return
86+
7287
try:
7388
unresolved = unresolve(absolute_uri)
7489
except UnresolverError:
7590
# If we were unable to resolve the URL, it
7691
# isn't pointing to a valid RTD project.
7792
return
7893

79-
if not unresolved.filename:
94+
# Don't track external versions.
95+
if version.is_external or not unresolved.filename:
8096
return
8197

82-
path = unresolved.filename
83-
full_path = urlparse(absolute_uri).path
84-
98+
path = urlparse(absolute_uri).path
8599
PageView.objects.register_page_view(
86100
project=project,
87101
version=version,
102+
filename=unresolved.filename,
88103
path=path,
89-
full_path=full_path,
90104
status=200,
91105
)
92106

readthedocs/analytics/tests.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from django.utils import timezone
77
from django_dynamic_fixture import get
88

9+
from readthedocs.builds.constants import EXTERNAL
910
from readthedocs.builds.models import Version
1011
from readthedocs.projects.constants import PUBLIC
1112
from readthedocs.projects.models import Project
@@ -99,7 +100,9 @@ def test_get_client_ip_with_remote_addr(self):
99100

100101

101102
@pytest.mark.proxito
102-
@override_settings(PUBLIC_DOMAIN="readthedocs.io")
103+
@override_settings(
104+
PUBLIC_DOMAIN="readthedocs.io", RTD_EXTERNAL_VERSION_DOMAIN="readthedocs.build"
105+
)
103106
class AnalyticsPageViewsTests(TestCase):
104107
def setUp(self):
105108
self.project = get(
@@ -190,3 +193,17 @@ def test_increase_page_view_count(self):
190193
assert (
191194
PageView.objects.all().order_by("-date").first().view_count == 1
192195
), f"'{self.absolute_uri}' has 1 view tomorrow"
196+
197+
def test_dont_track_external_domains(self):
198+
self.assertEqual(PageView.objects.all().count(), 0)
199+
get(
200+
Version,
201+
slug="123",
202+
type=EXTERNAL,
203+
built=True,
204+
active=True,
205+
)
206+
host = f"{self.project.slug}--123.readthedocs.build"
207+
r = self.client.get(self.url, headers={"host": host})
208+
self.assertEqual(r.status_code, 204)
209+
self.assertEqual(PageView.objects.all().count(), 0)

readthedocs/core/utils/requests.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import structlog
2+
3+
log = structlog.get_logger(__name__) # noqa
4+
5+
6+
def is_suspicious_request(request) -> bool:
7+
"""
8+
Returns True if the request is suspicious.
9+
10+
This function is used to detect bots and spammers,
11+
we use Cloudflare to detect them.
12+
"""
13+
# This header is set from Cloudflare,
14+
# it goes from 0 to 100, 0 being low risk,
15+
# and values above 10 are bots/spammers.
16+
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
17+
threat_score = int(request.headers.get("X-Cloudflare-Threat-Score", 0))
18+
if threat_score > 10:
19+
log.info(
20+
"Suspicious threat score",
21+
threat_score=threat_score,
22+
)
23+
return True
24+
return False

readthedocs/proxito/tests/test_full.py

+28
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,7 @@ def test_track_downloads(self):
799799
@override_settings(
800800
PYTHON_MEDIA=False,
801801
PUBLIC_DOMAIN='readthedocs.io',
802+
RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build",
802803
)
803804
# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
804805
# since the setting is evaluated just once (first test to use the storage
@@ -1466,6 +1467,33 @@ def test_track_broken_link(self, storage_exists):
14661467
self.assertEqual(pageview.view_count, 1)
14671468
self.assertEqual(pageview.status, 404)
14681469

1470+
@mock.patch.object(BuildMediaFileSystemStorageTest, "exists")
1471+
def test_dont_track_external_domains(self, storage_exists):
1472+
storage_exists.return_value = False
1473+
get(
1474+
Feature,
1475+
feature_id=Feature.RECORD_404_PAGE_VIEWS,
1476+
projects=[self.project],
1477+
)
1478+
get(
1479+
Version,
1480+
slug="123",
1481+
type=EXTERNAL,
1482+
built=True,
1483+
active=True,
1484+
)
1485+
self.assertEqual(PageView.objects.all().count(), 0)
1486+
1487+
resp = self.client.get(
1488+
reverse(
1489+
"proxito_404_handler",
1490+
kwargs={"proxito_path": "/en/123/"},
1491+
),
1492+
headers={"host": "project--123.dev.readthedocs.build"},
1493+
)
1494+
self.assertEqual(resp.status_code, 404)
1495+
self.assertEqual(PageView.objects.all().count(), 0)
1496+
14691497
@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
14701498
def test_track_broken_link_custom_404(self, storage_open):
14711499
get(

readthedocs/proxito/views/serve.py

+16-21
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
unresolver,
2424
)
2525
from readthedocs.core.utils.extend import SettingsOverrideObject
26+
from readthedocs.core.utils.requests import is_suspicious_request
2627
from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING, PRIVATE
2728
from readthedocs.projects.models import Domain, Feature, HTMLFile
2829
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
@@ -497,13 +498,14 @@ def get(self, request, proxito_path):
497498
# and we don't want to issue infinite redirects.
498499
pass
499500

500-
# Register 404 pages into our database for user's analytics
501-
self._register_broken_link(
502-
project=project,
503-
version=version,
504-
path=filename,
505-
full_path=proxito_path,
506-
)
501+
# Register 404 pages into our database for user's analytics.
502+
if not unresolved_domain.is_from_external_domain:
503+
self._register_broken_link(
504+
project=project,
505+
version=version,
506+
filename=filename,
507+
path=proxito_path,
508+
)
507509

508510
response = self._get_custom_404_page(
509511
request=request,
@@ -521,33 +523,26 @@ def get(self, request, proxito_path):
521523
path_not_found=proxito_path,
522524
)
523525

524-
def _register_broken_link(self, project, version, path, full_path):
526+
def _register_broken_link(self, project, version, filename, path):
525527
try:
526528
if not project.has_feature(Feature.RECORD_404_PAGE_VIEWS):
527529
return
528530

529-
# This header is set from Cloudflare,
530-
# it goes from 0 to 100, 0 being low risk,
531-
# and values above 10 are bots/spammers.
532-
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
533-
threat_score = int(self.request.headers.get("X-Cloudflare-Threat-Score", 0))
534-
if threat_score > 10:
531+
if is_suspicious_request(self.request):
535532
log.info(
536-
"Suspicious threat score, not recording 404.",
537-
threat_score=threat_score,
533+
"Suspicious request, not recording 404.",
538534
)
539535
return
540536

541-
# If the path isn't attached to a version
542-
# it should be the same as the full_path,
537+
# If we don't have a version, the filename is the path,
543538
# otherwise it would be empty.
544539
if not version:
545-
path = full_path
540+
filename = path
546541
PageView.objects.register_page_view(
547542
project=project,
548543
version=version,
544+
filename=filename,
549545
path=path,
550-
full_path=full_path,
551546
status=404,
552547
)
553548
except Exception:
@@ -556,7 +551,7 @@ def _register_broken_link(self, project, version, path, full_path):
556551
log.exception(
557552
"Error while recording the broken link",
558553
project_slug=project.slug,
559-
full_path=full_path,
554+
path=path,
560555
)
561556

562557
def _get_custom_404_page(self, request, project, version=None):

0 commit comments

Comments
 (0)