readthedocs · stsewd · Mar 1, 2023 · Feb 24, 2023 · Feb 28, 2023 · Feb 28, 2023
@@ -1,12 +1,12 @@
 """URL resolver for documentation."""
-
 from urllib.parse import urlunparse
 
 import structlog
 from django.conf import settings
 
 from readthedocs.builds.constants import EXTERNAL
 from readthedocs.core.utils.extend import SettingsOverrideObject
+from readthedocs.core.utils.url import join_url_path
 
 log = structlog.get_logger(__name__)
 
@@ -214,6 +214,56 @@ def resolve(
         )
         return urlunparse((protocol, domain, path, '', query_params, ''))
 
+    def _get_path_prefix(self, project):
+        """
+        Returns the path prefix for a project.
+
+        If the project is a subproject, it will return ``/projects/<subproject-alias>/``.
+        If the project is a main project, it will return ``/``.
+        This will respect the custom urlconf of the project if it's defined.
+        """
+        custom_prefix = project.custom_path_prefix
+        parent_relationship = project.get_parent_relationship()
+        if parent_relationship:
+            prefix = custom_prefix or "projects"
+            return join_url_path(prefix, parent_relationship.alias, "/")
+
+        prefix = custom_prefix or "/"
+        return join_url_path(prefix, "/")
+
+    def get_url_prefix(self, project, external_version_slug=None):
+        """
+        Get the URL prefix from which the documentation of ``project`` is served.
+
+        This doesn't include the version or language. For example:
+
+        - https://docs.example.com/projects/<subproject-alias>/
+        - https://docs.readthedocs.io/
+
+        This will respect the custom urlconf of the project if it's defined.
+
+        :param project: Project object to get the root URL from
+        :param external_version_slug: If given, resolve using the external version domain.
+        """
+        canonical_project = self._get_canonical_project(project)
+        use_custom_domain = self._use_cname(canonical_project)
+        custom_domain = canonical_project.get_canonical_custom_domain()
+        if external_version_slug:
+            domain = self._get_external_subdomain(
+                canonical_project, external_version_slug
+            )
+            use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
+        elif use_custom_domain and custom_domain:
+            domain = custom_domain.domain
+            use_https = custom_domain.https
+        else:
+            domain = self._get_project_subdomain(canonical_project)
+            use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
+
+        protocol = "https" if use_https else "http"
+        path = self._get_path_prefix(project)
+        return urlunparse((protocol, domain, path, "", "", ""))
+
     def _get_canonical_project_data(self, project):
         """
         Returns a tuple with (project, subproject_slug) from the canonical project of `project`.

@@ -0,0 +1,21 @@
+"""URL handling utilities."""
+
+
+def join_url_path(base, *args):
+    """
+    Joins a base URL path with one or more path components.
+
+    This does a simple join of the base path with the path components,
+    inserting a slash between each component.
+    The resulting path will always start with a slash.
+
+    .. warning::
+
+       This does not offer protection against directory traversal attacks,
+       it simply joins the path components together. This shouldn't be used
+       to serve files, use ``readthedocs.storage.utils.safe_join`` for that.
+    """
+    base = "/" + base.lstrip("/")
+    for path in args:
+        base = base.rstrip("/") + "/" + path.lstrip("/")
+    return base
@@ -31,6 +31,7 @@
 from readthedocs.core.history import ExtraHistoricalRecords
 from readthedocs.core.resolver import resolve, resolve_domain
 from readthedocs.core.utils import slugify
+from readthedocs.core.utils.url import join_url_path
 from readthedocs.domains.querysets import DomainQueryset
 from readthedocs.projects import constants
 from readthedocs.projects.exceptions import ProjectConfigurationError
@@ -641,8 +642,8 @@ def proxied_api_host(self):
         if self.urlconf:
             # Add our proxied api host at the first place we have a $variable
             # This supports both subpaths & normal root hosting
-            url_prefix = self.urlconf.split('$', 1)[0]
-            return '/' + url_prefix.strip('/') + '/_'
+            path_prefix = self.custom_path_prefix
+            return join_url_path(path_prefix, "/_")
         return '/_'
 
     @property
@@ -659,6 +660,17 @@ def proxied_static_path(self):
         """Path for static files hosted on the user's doc domain."""
         return f"{self.proxied_api_host}/static/"
 
+    @property
+    def custom_path_prefix(self):
+        """
+        Get the path prefix from the custom urlconf.
+
+        Returns `None` if the project doesn't have a custom urlconf.
+        """
+        if self.urlconf:
+            return self.urlconf.split("$", 1)[0]
+        return None
+
     @property
     def regex_urlconf(self):
         """
@@ -764,12 +776,12 @@ class ProxitoURLConf:
 
         return ProxitoURLConf
 
-    @property
+    @cached_property
     def is_subproject(self):
         """Return whether or not this project is a subproject."""
         return self.superprojects.exists()
 
-    @property
+    @cached_property
     def superproject(self):
         relationship = self.get_parent_relationship()
         if relationship:

@@ -5,6 +5,41 @@ Module in charge of serving documentation pages.
 
 Read the Docs core team members can view the `Proxito design doc <https://github.com/readthedocs/el-proxito/blob/master/docs/design/architecture.rst>`_
 
+URL parts
+---------
+
+In our code we use the following terms to refer to the different parts of the URL:
+
+url:
+   The full URL, for example ``https://docs.readthedocs.io/en/latest/index.html``.
+path:
+   The whole path from the URL, for example ``/en/latest/index.html``.
+domain:
+   The domain/subdomain without the protocol, for example ``docs.readthedocs.io``.
+language:
+   The language of the documentation, for example ``en``.
+version:
+   The version of the documentation, for example ``latest``.
+filename:
+   The name of the file being served, for example ``index.html``.
+path prefix:
+   The path prefix of the URL without version or language,
+   for a normal project this is ``/``, and for subprojects this is ``/projects/<subproject-alias>/``.
+
+.. code:: text
+
+                         URL
+   |------------------------------------------------|
+                                        path
+                              |---------------------|
+    https://docs.readthedocs.io/en/latest/index.html
+   |-------|-------------------|--|------|----------|
+    protocol         |          |     |       |
+                   domain       |     |       |
+                             language |       |
+                                    version   |
+                                           filename
+
 CDN
 ---
 

@@ -1460,11 +1460,13 @@ def test_cache_on_private_versions_custom_domain(self):
         self.domain.save()
         self._test_cache_control_header_project(expected_value='private', host=self.domain.domain)
 
-        # HTTPS redirect respects the privacy level of the version.
-        resp = self.client.get('/en/latest/', secure=False, HTTP_HOST=self.domain.domain)
-        self.assertEqual(resp['Location'], f'https://{self.domain.domain}/en/latest/')
-        self.assertEqual(resp.headers['CDN-Cache-Control'], 'private')
-        self.assertEqual(resp.headers['Cache-Tag'], 'project,project:latest')
+        # HTTPS redirects can always be cached.
+        resp = self.client.get(
+            "/en/latest/", secure=False, HTTP_HOST=self.domain.domain
+        )
+        self.assertEqual(resp["Location"], f"https://{self.domain.domain}/en/latest/")
+        self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
+        self.assertEqual(resp.headers["Cache-Tag"], "project")
 
     def test_cache_public_versions(self):
         self.project.versions.update(privacy_level=PUBLIC)
@@ -1492,15 +1494,18 @@ def test_cache_on_private_versions_custom_domain_subproject(self):
         self.domain.save()
         self._test_cache_control_header_subproject(expected_value='private', host=self.domain.domain)
 
-        # HTTPS redirect respects the privacy level of the version.
+        # HTTPS redirects can always be cached.
         resp = self.client.get(
             '/projects/subproject/en/latest/',
             secure=False,
             HTTP_HOST=self.domain.domain,
         )
-        self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/')
-        self.assertEqual(resp.headers['CDN-Cache-Control'], 'private')
-        self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest')
+        self.assertEqual(
+            resp["Location"],
+            f"https://{self.domain.domain}/projects/subproject/en/latest/",
+        )
+        self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
+        self.assertEqual(resp.headers["Cache-Tag"], "project")
 
     def test_cache_public_versions_subproject(self):
         self.subproject.versions.update(privacy_level=PUBLIC)
@@ -1512,15 +1517,18 @@ def test_cache_public_versions_custom_domain(self):
         self.domain.save()
         self._test_cache_control_header_subproject(expected_value='public', host=self.domain.domain)
 
-        # HTTPS redirect respects the privacy level of the version.
+        # HTTPS redirects can always be cached.
         resp = self.client.get(
             '/projects/subproject/en/latest/',
             secure=False,
             HTTP_HOST=self.domain.domain,
         )
-        self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/')
-        self.assertEqual(resp.headers['CDN-Cache-Control'], 'public')
-        self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest')
+        self.assertEqual(
+            resp["Location"],
+            f"https://{self.domain.domain}/projects/subproject/en/latest/",
+        )
+        self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
+        self.assertEqual(resp.headers["Cache-Tag"], "project")
 
     def test_cache_disable_on_rtd_header_resolved_project(self):
         get(

@@ -116,8 +116,12 @@ def test_subproject_redirect(self):
         )
         resp = self.client.get(self.url, HTTP_HOST="subproject.dev.readthedocs.io")
         self.assertEqual(resp.status_code, 302)
-        self.assertEqual(resp["location"], f"http://pip.dev.readthedocs.io/")
-        self.assertEqual(resp["X-RTD-Redirect"], RedirectType.to_canonical_domain.name)
+        self.assertEqual(
+            resp["location"], f"http://pip.dev.readthedocs.io/projects/subproject/"
+        )
+        self.assertEqual(
+            resp["X-RTD-Redirect"], RedirectType.subproject_to_main_domain.name
+        )
 
     # We are not canonicalizing custom domains -> public domain for now
     @pytest.mark.xfail(strict=True)

@@ -83,7 +83,17 @@ def test_subproject_redirect(self):
         r = self.client.get('/', HTTP_HOST='subproject.dev.readthedocs.io')
         self.assertEqual(r.status_code, 302)
         self.assertEqual(
-            r['Location'], 'https://project.dev.readthedocs.io/projects/subproject/en/latest/',
+            r["Location"],
+            "https://project.dev.readthedocs.io/projects/subproject/",
+        )
+
+        r = self.client.get(
+            "/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io"
+        )
+        self.assertEqual(r.status_code, 302)
+        self.assertEqual(
+            r["Location"],
+            "https://project.dev.readthedocs.io/projects/subproject/en/latest/",
         )
 
         r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io')

@@ -19,7 +19,8 @@
 from readthedocs.analytics.utils import get_client_ip
 from readthedocs.audit.models import AuditLog
 from readthedocs.builds.constants import EXTERNAL, INTERNAL
-from readthedocs.core.resolver import resolve
+from readthedocs.core.resolver import resolve, resolver
+from readthedocs.core.utils.url import join_url_path
 from readthedocs.projects.constants import MEDIA_TYPE_HTML
 from readthedocs.proxito.constants import RedirectType
 from readthedocs.redirects.exceptions import InfiniteRedirectException
@@ -337,10 +338,8 @@ def canonical_redirect(
         self,
         request,
         final_project,
-        version_slug,
-        filename,
         redirect_type,
-        is_external_version=False,
+        external_version_slug=None,
     ):
         """
         Return a redirect to the canonical domain including scheme.
@@ -358,34 +357,34 @@ def canonical_redirect(
 
         :param request: Request object.
         :param final_project: The current project being served.
-        :param version_slug: The current version slug being served.
-        :param filename: The filename being served.
         :param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
-        :param external: If the version is from a pull request preview.
+        :param external_version_slug: The version slug if the request is from a pull request preview.
         """
         from_url = request.build_absolute_uri()
         parsed_from = urlparse(from_url)
 
         if redirect_type == RedirectType.http_to_https:
+            # We only need to change the protocol.
             to = parsed_from._replace(scheme="https").geturl()
-        elif redirect_type in [
-            RedirectType.to_canonical_domain,
-            RedirectType.subproject_to_main_domain,
-        ]:
-            to = resolve(
+        elif redirect_type == RedirectType.to_canonical_domain:
+            # We need to change the domain and protocol.
+            canonical_domain = final_project.get_canonical_custom_domain()
+            protocol = "https" if canonical_domain.https else "http"
+            to = parsed_from._replace(
+                scheme=protocol, netloc=canonical_domain.domain
+            ).geturl()
+        elif redirect_type == RedirectType.subproject_to_main_domain:
+            # We need to get the subproject root in the domain of the main
+            # project, and append the current path.
+            project_doc_prefix = resolver.get_url_prefix(
                 project=final_project,
-                version_slug=version_slug,
-                filename=filename,
-                query_params=parsed_from.query,
-                external=is_external_version,
+                external_version_slug=external_version_slug,
             )
-            # When a canonical redirect is done, only change the domain.
-            if redirect_type == RedirectType.to_canonical_domain:
-                parsed_to = urlparse(to)
-                to = parsed_from._replace(
-                    scheme=parsed_to.scheme,
-                    netloc=parsed_to.netloc,
-                ).geturl()
+            parsed_doc_prefix = urlparse(project_doc_prefix)
+            to = parsed_doc_prefix._replace(
+                path=join_url_path(parsed_doc_prefix.path, parsed_from.path),
+                query=parsed_from.query,
+            ).geturl()
         else:
             raise NotImplementedError
 
@@ -397,7 +396,11 @@ def canonical_redirect(
             )
             raise InfiniteRedirectException()
 
-        log.info('Canonical Redirect.', host=request.get_host(), from_url=filename, to_url=to)
+        log.info(
+            "Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
+        )
+        # Canonical redirects can be cached, since the final URL will check for authz.
+        self.cache_request = True
         resp = HttpResponseRedirect(to)
         resp["X-RTD-Redirect"] = redirect_type.name
         return resp