readthedocs · stsewd · Mar 1, 2023 · Feb 24, 2023 · Feb 28, 2023 · Feb 28, 2023
@@ -214,28 +214,26 @@ def resolve(
         )
         return urlunparse((protocol, domain, path, '', query_params, ''))
 
-    def _get_root_path(self, project):
+    def _get_path_prefix(self, project):
         """
-        Returns the root path for a project.
+        Returns the path prefix for a project.
 
-        If the project is a subproject, it will return ``/projects/<project-slug>/``.
+        If the project is a subproject, it will return ``/projects/<subproject-alias>/``.
         If the project is a main project, it will return ``/``.
         This will respect the custom urlconf of the project if it's defined.
         """
-        custom_prefix = None
-        if project.urlconf:
-            custom_prefix = project.urlconf.split("$", 1)[0]
-
-        if project.is_subproject:
+        custom_prefix = project.custom_path_prefix
+        parent_relationship = project.get_parent_relationship()
+        if parent_relationship:
             prefix = custom_prefix or "projects"
-            return join_url_path(prefix, project.slug, "/")
+            return join_url_path(prefix, parent_relationship.alias, "/")
 
         prefix = custom_prefix or "/"
         return join_url_path(prefix, "/")
 
-    def resolve_root(self, project, external_version_slug=None):
+    def get_url_prefix(self, project, external_version_slug=None):
         """
-        Get the root URL from where the documentation of ``project`` is served from.
+        Get the URL prefix from where the documentation of ``project`` is served from.
 
         This doesn't include the version or language. For example:
 
@@ -263,7 +261,7 @@ def resolve_root(self, project, external_version_slug=None):
             use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
 
         protocol = "https" if use_https else "http"
-        path = self._get_root_path(project)
+        path = self._get_path_prefix(project)
         return urlunparse((protocol, domain, path, "", "", ""))
 
     def _get_canonical_project_data(self, project):

@@ -7,7 +7,13 @@ def join_url_path(base, *args):
 
     This does a simple join of the base path with the path components,
     inserting a slash between each component.
-    It does not offer protection against directory traversal attacks.
+    The resulting path will always start with a slash.
+
+    .. warning::
+
+       This does not offer protection against directory traversal attacks,
+       it simply joins the path components together. This shouldn't be used
+       to serve files, use ``readthedocs.storage.utils.safe_join`` for that.
     """
     base = "/" + base.lstrip("/")
     for path in args:

@@ -31,6 +31,7 @@
 from readthedocs.core.history import ExtraHistoricalRecords
 from readthedocs.core.resolver import resolve, resolve_domain
 from readthedocs.core.utils import slugify
+from readthedocs.core.utils.url import join_url_path
 from readthedocs.domains.querysets import DomainQueryset
 from readthedocs.projects import constants
 from readthedocs.projects.exceptions import ProjectConfigurationError
@@ -630,8 +631,8 @@ def proxied_api_host(self):
         if self.urlconf:
             # Add our proxied api host at the first place we have a $variable
             # This supports both subpaths & normal root hosting
-            url_prefix = self.urlconf.split('$', 1)[0]
-            return '/' + url_prefix.strip('/') + '/_'
+            path_prefix = self.custom_path_prefix
+            return join_url_path(path_prefix, "/_")
         return '/_'
 
     @property
@@ -648,6 +649,17 @@ def proxied_static_path(self):
         """Path for static files hosted on the user's doc domain."""
         return f"{self.proxied_api_host}/static/"
 
+    @property
+    def custom_path_prefix(self):
+        """
+        Get the path prefix from the custom urlconf.
+
+        Returns `None` if the project doesn't have a custom urlconf.
+        """
+        if self.urlconf:
+            return self.urlconf.split("$", 1)[0]
+        return None
+
     @property
     def regex_urlconf(self):
         """
@@ -758,7 +770,7 @@ def is_subproject(self):
         """Return whether or not this project is a subproject."""
         return self.superprojects.exists()
 
-    @property
+    @cached_property
     def superproject(self):
         relationship = self.get_parent_relationship()
         if relationship:

@@ -5,6 +5,41 @@ Module in charge of serving documentation pages.
 
 Read the Docs core team members can view the `Proxito design doc <https://github.com/readthedocs/el-proxito/blob/master/docs/design/architecture.rst>`_
 
+URL parts
+---------
+
+In our code we use the following terms to refer to the different parts of the URL:
+
+url:
+   The full URL, for example ``https://docs.readthedocs.io/en/latest/index.html``.
+path:
+   The whole path from the URL, for example ``/en/latest/index.html``.
+domain:
+   The domain/subdomain without the protocol, for example ``docs.readthedocs.io``.
+language:
+   The language of the documentation, for example ``en``.
+version:
+   The version of the documentation, for example ``latest``.
+filename:
+   The name of the file being served, for example ``index.html``.
+path prefix:
+   The path prefix of the URL without version or language,
+   for a normal project this is ``/``, and for subprojects this is ``/projects/<subproject-alias>/``.
+
+.. code:: text
+
+                         URL
+   |------------------------------------------------|
+                                        path
+                              |---------------------|
+    https://docs.readthedocs.io/en/latest/index.html
+   |-------|-------------------|--|------|----------|
+    protocol         |          |     |       |
+                   domain       |     |       |
+                             language |       |
+                                    version   |
+                                           filename
+
 CDN
 ---
 
@@ -39,9 +74,7 @@ What can/can't be cached?
 
 - ServeRobotsTXT: can be cached, we don't serve a custom robots.txt
   to any user if the default version is private.
-  This view is already cached at the application level.
 - ServeSitemapXML: can be cached. It displays only public versions, for everyone.
-  This view is already cached at the application level.
 - ServeStaticFiles: can be cached, all files are the same for all projects and users.
 - Embed API: can be cached for public versions.
 - Search:
@@ -54,3 +87,4 @@ What can/can't be cached?
   - If the project doesn't have subprojects.
   - All subprojects are public.
 - Analytics API: can't be cached, we want to always hit our serves with this one.
+- Health check view: shouldn't be cached, we want to always hit our serves with this one.
@@ -87,6 +87,15 @@ def test_subproject_redirect(self):
             "https://project.dev.readthedocs.io/projects/subproject/",
         )
 
+        r = self.client.get(
+            "/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io"
+        )
+        self.assertEqual(r.status_code, 302)
+        self.assertEqual(
+            r["Location"],
+            "https://project.dev.readthedocs.io/projects/subproject/en/latest/",
+        )
+
         r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io')
         self.assertEqual(r.status_code, 302)
         self.assertEqual(

@@ -370,13 +370,13 @@ def canonical_redirect(
         elif redirect_type == RedirectType.subproject_to_main_domain:
             # We need to get the subproject root in the domain of the main
             # project, and append the current path.
-            project_doc_root = resolver.resolve_root(
+            project_doc_prefix = resolver.get_url_prefix(
                 project=final_project,
                 external_version_slug=external_version_slug,
             )
-            parsed_doc_root = urlparse(project_doc_root)
-            to = parsed_doc_root._replace(
-                path=join_url_path(parsed_doc_root.path, parsed_from.path),
+            parsed_doc_prefix = urlparse(project_doc_prefix)
+            to = parsed_doc_prefix._replace(
+                path=join_url_path(parsed_doc_prefix.path, parsed_from.path),
                 query=parsed_from.query,
             ).geturl()
         else:

@@ -90,11 +90,15 @@ def get(self,
         so that we can decide if we need to serve docs or add a /.
         """
         unresolved_domain = request.unresolved_domain
-        # Handle requests that need canonicalizing,
+        # Handle requests that need canonicalizing first,
         # e.g. HTTP -> HTTPS, redirect to canonical domain, etc.
+        # We run this here to reduce work we need to do on easily cached responses.
+        # It's slower for the end user to have multiple HTTP round trips,
+        # but reduces chances for URL resolving bugs,
+        # and makes caching more effective because we don't care about authz.
         redirect_type = self._get_canonical_redirect_type(request)
         if redirect_type:
-            # Attach the current project to the request.
+            # TODO: find a better way to pass this to the middleware.
             request.path_project_slug = unresolved_domain.project.slug
             try:
                 return self.canonical_redirect(
@@ -104,7 +108,9 @@ def get(self,
                     redirect_type=redirect_type,
                 )
             except InfiniteRedirectException:
-                # Don't redirect in this case, since it would break things.
+                # ``canonical_redirect`` raises this when it's redirecting back to itself.
+                # We can safely ignore it here because it's logged in ``canonical_redirect``,
+                # and we don't want to issue infinite redirects.
                 pass
 
         version_slug = self.get_version_from_host(request, version_slug)