Skip to content

Analytics: don't record page views for PR previews #11065

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions readthedocs/analytics/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,25 @@ class PageViewManager(models.Manager):

"""Manager for PageView model."""

def register_page_view(self, project, version, path, full_path, status):
def register_page_view(self, project, version, filename, path, status):
"""Track page view with the given parameters."""
# TODO: remove after the migration of duplicate records has been completed.
if project.has_feature(Feature.DISABLE_PAGEVIEWS):
return

# Normalize paths to avoid duplicates.
filename = "/" + filename.lstrip("/")
path = "/" + path.lstrip("/")
full_path = "/" + full_path.lstrip("/")

page_view, created = self.get_or_create(
project=project,
version=version,
path=path,
path=filename,
date=timezone.now().date(),
status=status,
defaults={
"view_count": 1,
"full_path": full_path,
"full_path": path,
},
)
if not created:
Expand Down
24 changes: 19 additions & 5 deletions readthedocs/analytics/proxied_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import lru_cache
from urllib.parse import urlparse

import structlog
from django.http import JsonResponse
from django.shortcuts import get_object_or_404
from rest_framework import status
Expand All @@ -13,8 +14,10 @@
from readthedocs.core.mixins import CDNCacheControlMixin
from readthedocs.core.unresolver import UnresolverError, unresolve
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.core.utils.requests import is_suspicious_request
from readthedocs.projects.models import Project

log = structlog.get_logger(__name__) # noqa

class BaseAnalyticsView(CDNCacheControlMixin, APIView):

Expand Down Expand Up @@ -51,6 +54,7 @@ def _get_version(self):
return version

def get(self, request, *args, **kwargs):
# TODO: Use absolute_uri only, we don't need project and version.
project = self._get_project()
version = self._get_version()
absolute_uri = self.request.GET.get("absolute_uri")
Expand All @@ -69,24 +73,34 @@ def get(self, request, *args, **kwargs):

def increase_page_view_count(self, project, version, absolute_uri):
"""Increase the page view count for the given project."""
if is_suspicious_request(self.request):
log.info(
"Suspicious request, not recording pageview.",
url=absolute_uri,
)
return

# Don't allow tracking page views from external domains.
if self.request.unresolved_domain.is_from_external_domain:
return

try:
unresolved = unresolve(absolute_uri)
except UnresolverError:
# If we were unable to resolve the URL, it
# isn't pointing to a valid RTD project.
return

if not unresolved.filename:
# Don't track external versions.
if version.is_external or not unresolved.filename:
return
Comment on lines +95 to 96
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why version.is_external is required here? Won't be captured by the previous line if self.request.unresolved_domain.is_from_external_domain:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first check is to know if we are calling the API from an external domain foo--123.readthedocs.build, this check is to know if we are recording a page view for an external version from a normal domain.

We should skip calling this endpoint from the client, probably something we can do from the addons, for our current js client, we would need to return a new field to know if the version is external.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should skip calling this endpoint from the client, probably something we can do from the addons

This makes sense. Feel free to open an issue in the addons repository and I can work on it.

The first check is to know if we are calling the API from an external domain foo--123.readthedocs.build, this check is to know if we are recording a page view for an external version from a normal domain.

Would you mind adding this knowledge as a small code comment on each check?


path = unresolved.filename
full_path = urlparse(absolute_uri).path

path = urlparse(absolute_uri).path
PageView.objects.register_page_view(
project=project,
version=version,
filename=unresolved.filename,
path=path,
full_path=full_path,
status=200,
)

Expand Down
19 changes: 18 additions & 1 deletion readthedocs/analytics/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.utils import timezone
from django_dynamic_fixture import get

from readthedocs.builds.constants import EXTERNAL
from readthedocs.builds.models import Version
from readthedocs.projects.constants import PUBLIC
from readthedocs.projects.models import Project
Expand Down Expand Up @@ -99,7 +100,9 @@ def test_get_client_ip_with_remote_addr(self):


@pytest.mark.proxito
@override_settings(PUBLIC_DOMAIN="readthedocs.io")
@override_settings(
PUBLIC_DOMAIN="readthedocs.io", RTD_EXTERNAL_VERSION_DOMAIN="readthedocs.build"
)
class AnalyticsPageViewsTests(TestCase):
def setUp(self):
self.project = get(
Expand Down Expand Up @@ -190,3 +193,17 @@ def test_increase_page_view_count(self):
assert (
PageView.objects.all().order_by("-date").first().view_count == 1
), f"'{self.absolute_uri}' has 1 view tomorrow"

def test_dont_track_external_domains(self):
self.assertEqual(PageView.objects.all().count(), 0)
get(
Version,
slug="123",
type=EXTERNAL,
built=True,
active=True,
)
host = f"{self.project.slug}--123.readthedocs.build"
r = self.client.get(self.url, headers={"host": host})
self.assertEqual(r.status_code, 204)
self.assertEqual(PageView.objects.all().count(), 0)
24 changes: 24 additions & 0 deletions readthedocs/core/utils/requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import structlog

log = structlog.get_logger(__name__) # noqa


def is_suspicious_request(request) -> bool:
"""
Returns True if the request is suspicious.

This function is used to detect bots and spammers,
we use Cloudflare to detect them.
"""
# This header is set from Cloudflare,
# it goes from 0 to 100, 0 being low risk,
# and values above 10 are bots/spammers.
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
threat_score = int(request.headers.get("X-Cloudflare-Threat-Score", 0))
if threat_score > 10:
log.info(
"Suspicious threat score",
threat_score=threat_score,
)
return True
return False
28 changes: 28 additions & 0 deletions readthedocs/proxito/tests/test_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,7 @@ def test_track_downloads(self):
@override_settings(
PYTHON_MEDIA=False,
PUBLIC_DOMAIN='readthedocs.io',
RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build",
)
# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
# since the setting is evaluated just once (first test to use the storage
Expand Down Expand Up @@ -1466,6 +1467,33 @@ def test_track_broken_link(self, storage_exists):
self.assertEqual(pageview.view_count, 1)
self.assertEqual(pageview.status, 404)

@mock.patch.object(BuildMediaFileSystemStorageTest, "exists")
def test_dont_track_external_domains(self, storage_exists):
storage_exists.return_value = False
get(
Feature,
feature_id=Feature.RECORD_404_PAGE_VIEWS,
projects=[self.project],
)
get(
Version,
slug="123",
type=EXTERNAL,
built=True,
active=True,
)
self.assertEqual(PageView.objects.all().count(), 0)

resp = self.client.get(
reverse(
"proxito_404_handler",
kwargs={"proxito_path": "/en/123/"},
),
headers={"host": "project--123.dev.readthedocs.build"},
)
self.assertEqual(resp.status_code, 404)
self.assertEqual(PageView.objects.all().count(), 0)

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_track_broken_link_custom_404(self, storage_open):
get(
Expand Down
37 changes: 16 additions & 21 deletions readthedocs/proxito/views/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
unresolver,
)
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.core.utils.requests import is_suspicious_request
from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING, PRIVATE
from readthedocs.projects.models import Domain, Feature, HTMLFile
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
Expand Down Expand Up @@ -497,13 +498,14 @@ def get(self, request, proxito_path):
# and we don't want to issue infinite redirects.
pass

# Register 404 pages into our database for user's analytics
self._register_broken_link(
project=project,
version=version,
path=filename,
full_path=proxito_path,
)
# Register 404 pages into our database for user's analytics.
if not unresolved_domain.is_from_external_domain:
self._register_broken_link(
project=project,
version=version,
filename=filename,
path=proxito_path,
)

response = self._get_custom_404_page(
request=request,
Expand All @@ -521,33 +523,26 @@ def get(self, request, proxito_path):
path_not_found=proxito_path,
)

def _register_broken_link(self, project, version, path, full_path):
def _register_broken_link(self, project, version, filename, path):
try:
if not project.has_feature(Feature.RECORD_404_PAGE_VIEWS):
return

# This header is set from Cloudflare,
# it goes from 0 to 100, 0 being low risk,
# and values above 10 are bots/spammers.
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
threat_score = int(self.request.headers.get("X-Cloudflare-Threat-Score", 0))
if threat_score > 10:
if is_suspicious_request(self.request):
log.info(
"Suspicious threat score, not recording 404.",
threat_score=threat_score,
"Suspicious request, not recording 404.",
)
return

# If the path isn't attached to a version
# it should be the same as the full_path,
# If we don't have a version, the filename is the path,
# otherwise it would be empty.
if not version:
path = full_path
filename = path
PageView.objects.register_page_view(
project=project,
version=version,
filename=filename,
path=path,
full_path=full_path,
status=404,
)
except Exception:
Expand All @@ -556,7 +551,7 @@ def _register_broken_link(self, project, version, path, full_path):
log.exception(
"Error while recording the broken link",
project_slug=project.slug,
full_path=full_path,
path=path,
)

def _get_custom_404_page(self, request, project, version=None):
Expand Down