readthedocs · stsewd · Jan 29, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
@@ -25,25 +25,25 @@ class PageViewManager(models.Manager):
 
     """Manager for PageView model."""
 
-    def register_page_view(self, project, version, path, full_path, status):
+    def register_page_view(self, project, version, filename, path, status):
         """Track page view with the given parameters."""
         # TODO: remove after the migration of duplicate records has been completed.
         if project.has_feature(Feature.DISABLE_PAGEVIEWS):
             return
 
         # Normalize paths to avoid duplicates.
+        filename = "/" + filename.lstrip("/")
         path = "/" + path.lstrip("/")
-        full_path = "/" + full_path.lstrip("/")
 
         page_view, created = self.get_or_create(
             project=project,
             version=version,
-            path=path,
+            path=filename,
             date=timezone.now().date(),
             status=status,
             defaults={
                 "view_count": 1,
-                "full_path": full_path,
+                "full_path": path,
             },
         )
         if not created:

@@ -2,6 +2,7 @@
 from functools import lru_cache
 from urllib.parse import urlparse
 
+import structlog
 from django.http import JsonResponse
 from django.shortcuts import get_object_or_404
 from rest_framework import status
@@ -13,8 +14,10 @@
 from readthedocs.core.mixins import CDNCacheControlMixin
 from readthedocs.core.unresolver import UnresolverError, unresolve
 from readthedocs.core.utils.extend import SettingsOverrideObject
+from readthedocs.core.utils.requests import is_suspicious_request
 from readthedocs.projects.models import Project
 
+log = structlog.get_logger(__name__)  # noqa
 
 class BaseAnalyticsView(CDNCacheControlMixin, APIView):
 
@@ -51,6 +54,7 @@ def _get_version(self):
         return version
 
     def get(self, request, *args, **kwargs):
+        # TODO: Use absolute_uri only, we don't need project and version.
         project = self._get_project()
         version = self._get_version()
         absolute_uri = self.request.GET.get("absolute_uri")
@@ -69,24 +73,34 @@ def get(self, request, *args, **kwargs):
 
     def increase_page_view_count(self, project, version, absolute_uri):
         """Increase the page view count for the given project."""
+        if is_suspicious_request(self.request):
+            log.info(
+                "Suspicious request, not recording pageview.",
+                url=absolute_uri,
+            )
+            return
+
+        # Don't allow tracking page views from external domains.
+        if self.request.unresolved_domain.is_from_external_domain:
+            return
+
         try:
             unresolved = unresolve(absolute_uri)
         except UnresolverError:
             # If we were unable to resolve the URL, it
             # isn't pointing to a valid RTD project.
             return
 
-        if not unresolved.filename:
+        # Don't track external versions.
+        if version.is_external or not unresolved.filename:
             return
 
-        path = unresolved.filename
-        full_path = urlparse(absolute_uri).path
-
+        path = urlparse(absolute_uri).path
         PageView.objects.register_page_view(
             project=project,
             version=version,
+            filename=unresolved.filename,
             path=path,
-            full_path=full_path,
             status=200,
         )
 

@@ -6,6 +6,7 @@
 from django.utils import timezone
 from django_dynamic_fixture import get
 
+from readthedocs.builds.constants import EXTERNAL
 from readthedocs.builds.models import Version
 from readthedocs.projects.constants import PUBLIC
 from readthedocs.projects.models import Project
@@ -99,7 +100,9 @@ def test_get_client_ip_with_remote_addr(self):
 
 
 @pytest.mark.proxito
-@override_settings(PUBLIC_DOMAIN="readthedocs.io")
+@override_settings(
+    PUBLIC_DOMAIN="readthedocs.io", RTD_EXTERNAL_VERSION_DOMAIN="readthedocs.build"
+)
 class AnalyticsPageViewsTests(TestCase):
     def setUp(self):
         self.project = get(
@@ -190,3 +193,17 @@ def test_increase_page_view_count(self):
             assert (
                 PageView.objects.all().order_by("-date").first().view_count == 1
             ), f"'{self.absolute_uri}' has 1 view tomorrow"
+
+    def test_dont_track_external_domains(self):
+        self.assertEqual(PageView.objects.all().count(), 0)
+        get(
+            Version,
+            slug="123",
+            type=EXTERNAL,
+            built=True,
+            active=True,
+        )
+        host = f"{self.project.slug}--123.readthedocs.build"
+        r = self.client.get(self.url, headers={"host": host})
+        self.assertEqual(r.status_code, 204)
+        self.assertEqual(PageView.objects.all().count(), 0)
@@ -0,0 +1,24 @@
+import structlog
+
+log = structlog.get_logger(__name__)  # noqa
+
+
+def is_suspicious_request(request) -> bool:
+    """
+    Returns True if the request is suspicious.
+
+    This function is used to detect bots and spammers,
+    we use Cloudflare to detect them.
+    """
+    # This header is set from Cloudflare,
+    # it goes from 0 to 100, 0 being low risk,
+    # and values above 10 are bots/spammers.
+    # https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
+    threat_score = int(request.headers.get("X-Cloudflare-Threat-Score", 0))
+    if threat_score > 10:
+        log.info(
+            "Suspicious threat score",
+            threat_score=threat_score,
+        )
+        return True
+    return False
@@ -799,6 +799,7 @@ def test_track_downloads(self):
 @override_settings(
     PYTHON_MEDIA=False,
     PUBLIC_DOMAIN='readthedocs.io',
+    RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build",
 )
 # We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
 # since the setting is evaluated just once (first test to use the storage
@@ -1466,6 +1467,33 @@ def test_track_broken_link(self, storage_exists):
         self.assertEqual(pageview.view_count, 1)
         self.assertEqual(pageview.status, 404)
 
+    @mock.patch.object(BuildMediaFileSystemStorageTest, "exists")
+    def test_dont_track_external_domains(self, storage_exists):
+        storage_exists.return_value = False
+        get(
+            Feature,
+            feature_id=Feature.RECORD_404_PAGE_VIEWS,
+            projects=[self.project],
+        )
+        get(
+            Version,
+            slug="123",
+            type=EXTERNAL,
+            built=True,
+            active=True,
+        )
+        self.assertEqual(PageView.objects.all().count(), 0)
+
+        resp = self.client.get(
+            reverse(
+                "proxito_404_handler",
+                kwargs={"proxito_path": "/en/123/"},
+            ),
+            headers={"host": "project--123.dev.readthedocs.build"},
+        )
+        self.assertEqual(resp.status_code, 404)
+        self.assertEqual(PageView.objects.all().count(), 0)
+
     @mock.patch.object(BuildMediaFileSystemStorageTest, "open")
     def test_track_broken_link_custom_404(self, storage_open):
         get(

@@ -23,6 +23,7 @@
     unresolver,
 )
 from readthedocs.core.utils.extend import SettingsOverrideObject
+from readthedocs.core.utils.requests import is_suspicious_request
 from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING, PRIVATE
 from readthedocs.projects.models import Domain, Feature, HTMLFile
 from readthedocs.projects.templatetags.projects_tags import sort_version_aware
@@ -497,13 +498,14 @@ def get(self, request, proxito_path):
                 # and we don't want to issue infinite redirects.
                 pass
 
-        # Register 404 pages into our database for user's analytics
-        self._register_broken_link(
-            project=project,
-            version=version,
-            path=filename,
-            full_path=proxito_path,
-        )
+        # Register 404 pages into our database for user's analytics.
+        if not unresolved_domain.is_from_external_domain:
+            self._register_broken_link(
+                project=project,
+                version=version,
+                filename=filename,
+                path=proxito_path,
+            )
 
         response = self._get_custom_404_page(
             request=request,
@@ -521,33 +523,26 @@ def get(self, request, proxito_path):
             path_not_found=proxito_path,
         )
 
-    def _register_broken_link(self, project, version, path, full_path):
+    def _register_broken_link(self, project, version, filename, path):
         try:
             if not project.has_feature(Feature.RECORD_404_PAGE_VIEWS):
                 return
 
-            # This header is set from Cloudflare,
-            # it goes from 0 to 100, 0 being low risk,
-            # and values above 10 are bots/spammers.
-            # https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
-            threat_score = int(self.request.headers.get("X-Cloudflare-Threat-Score", 0))
-            if threat_score > 10:
+            if is_suspicious_request(self.request):
                 log.info(
-                    "Suspicious threat score, not recording 404.",
-                    threat_score=threat_score,
+                    "Suspicious request, not recording 404.",
                 )
                 return
 
-            # If the path isn't attached to a version
-            # it should be the same as the full_path,
+            # If we don't have a version, the filename is the path,
             # otherwise it would be empty.
             if not version:
-                path = full_path
+                filename = path
             PageView.objects.register_page_view(
                 project=project,
                 version=version,
+                filename=filename,
                 path=path,
-                full_path=full_path,
                 status=404,
             )
         except Exception:
@@ -556,7 +551,7 @@ def _register_broken_link(self, project, version, path, full_path):
             log.exception(
                 "Error while recording the broken link",
                 project_slug=project.slug,
-                full_path=full_path,
+                path=path,
             )
 
     def _get_custom_404_page(self, request, project, version=None):