diff --git a/readthedocs/core/resolver.py b/readthedocs/core/resolver.py index 204be38fc09..24946148921 100644 --- a/readthedocs/core/resolver.py +++ b/readthedocs/core/resolver.py @@ -1,5 +1,4 @@ """URL resolver for documentation.""" - from urllib.parse import urlunparse import structlog @@ -7,6 +6,7 @@ from readthedocs.builds.constants import EXTERNAL from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.core.utils.url import unsafe_join_url_path log = structlog.get_logger(__name__) @@ -214,6 +214,56 @@ def resolve( ) return urlunparse((protocol, domain, path, '', query_params, '')) + def _get_path_prefix(self, project): + """ + Returns the path prefix for a project. + + If the project is a subproject, it will return ``/projects//``. + If the project is a main project, it will return ``/``. + This will respect the custom urlconf of the project if it's defined. + """ + custom_prefix = project.custom_path_prefix + parent_relationship = project.get_parent_relationship() + if parent_relationship: + prefix = custom_prefix or "projects" + return unsafe_join_url_path(prefix, parent_relationship.alias, "/") + + prefix = custom_prefix or "/" + return unsafe_join_url_path(prefix, "/") + + def get_url_prefix(self, project, external_version_slug=None): + """ + Get the URL prefix from where the documentation of ``project`` is served from. + + This doesn't include the version or language. For example: + + - https://docs.example.com/projects// + - https://docs.readthedocs.io/ + + This will respect the custom urlconf of the project if it's defined. + + :param project: Project object to get the root URL from + :param external_version_slug: If given, resolve using the external version domain. + """ + canonical_project = self._get_canonical_project(project) + use_custom_domain = self._use_cname(canonical_project) + custom_domain = canonical_project.get_canonical_custom_domain() + if external_version_slug: + domain = self._get_external_subdomain( + canonical_project, external_version_slug + ) + use_https = settings.PUBLIC_DOMAIN_USES_HTTPS + elif use_custom_domain and custom_domain: + domain = custom_domain.domain + use_https = custom_domain.https + else: + domain = self._get_project_subdomain(canonical_project) + use_https = settings.PUBLIC_DOMAIN_USES_HTTPS + + protocol = "https" if use_https else "http" + path = self._get_path_prefix(project) + return urlunparse((protocol, domain, path, "", "", "")) + def _get_canonical_project_data(self, project): """ Returns a tuple with (project, subproject_slug) from the canonical project of `project`. diff --git a/readthedocs/core/utils/url.py b/readthedocs/core/utils/url.py new file mode 100644 index 00000000000..e1e564df925 --- /dev/null +++ b/readthedocs/core/utils/url.py @@ -0,0 +1,21 @@ +"""URL hadling utilities.""" + + +def unsafe_join_url_path(base, *args): + """ + Joins a base URL path with one or more path components. + + This does a simple join of the base path with the path components, + inserting a slash between each component. + The resulting path will always start with a slash. + + .. warning:: + + This does not offer protection against directory traversal attacks, + it simply joins the path components together. This shouldn't be used + to serve files, use ``readthedocs.storage.utils.safe_join`` for that. + """ + base = "/" + base.lstrip("/") + for path in args: + base = base.rstrip("/") + "/" + path.lstrip("/") + return base diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index a7343cc2021..3705d5a1133 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -31,6 +31,7 @@ from readthedocs.core.history import ExtraHistoricalRecords from readthedocs.core.resolver import resolve, resolve_domain from readthedocs.core.utils import slugify +from readthedocs.core.utils.url import unsafe_join_url_path from readthedocs.domains.querysets import DomainQueryset from readthedocs.projects import constants from readthedocs.projects.exceptions import ProjectConfigurationError @@ -645,8 +646,8 @@ def proxied_api_host(self): if self.urlconf: # Add our proxied api host at the first place we have a $variable # This supports both subpaths & normal root hosting - url_prefix = self.urlconf.split('$', 1)[0] - return '/' + url_prefix.strip('/') + '/_' + path_prefix = self.custom_path_prefix + return unsafe_join_url_path(path_prefix, "/_") return '/_' @property @@ -663,6 +664,19 @@ def proxied_static_path(self): """Path for static files hosted on the user's doc domain.""" return f"{self.proxied_api_host}/static/" + @property + def custom_path_prefix(self): + """ + Get the path prefix from the custom urlconf. + + Returns `None` if the project doesn't have a custom urlconf. + """ + if self.urlconf: + # Return the value before the first defined variable, + # as that is a prefix and not part of our normal doc patterns. + return self.urlconf.split("$", 1)[0] + return None + @property def regex_urlconf(self): """ @@ -768,12 +782,12 @@ class ProxitoURLConf: return ProxitoURLConf - @property + @cached_property def is_subproject(self): """Return whether or not this project is a subproject.""" return self.superprojects.exists() - @property + @cached_property def superproject(self): relationship = self.get_parent_relationship() if relationship: diff --git a/readthedocs/proxito/README.rst b/readthedocs/proxito/README.rst index 1d33c741d40..0c9afcc0a02 100644 --- a/readthedocs/proxito/README.rst +++ b/readthedocs/proxito/README.rst @@ -5,6 +5,43 @@ Module in charge of serving documentation pages. Read the Docs core team members can view the `Proxito design doc `_ +URL parts +--------- + +In our code we use the following terms to refer to the different parts of the URL: + +url: + The full URL including the protocol, for example ``https://docs.readthedocs.io/en/latest/api/index.html``. +path: + The whole path from the URL without query arguments or fragment, + for example ``/en/latest/api/index.html``. +domain: + The domain/subdomain without the protocol, for example ``docs.readthedocs.io``. +language: + The language of the documentation, for example ``en``. +version: + The version of the documentation, for example ``latest``. +filename: + The name of the file being served, for example ``/api/index.html``. +path prefix: + The path prefix of the URL without version or language, + for a normal project this is ``/``, and for subprojects this is ``/projects//``. + This prefix can be different for project defining their own urlconf. + +.. code:: text + + URL + |----------------------------------------------------| + path + |-------------------------| + https://docs.readthedocs.io/en/latest/api/index.html + |-------|-------------------|--|------|--------------| + protocol | | | | + domain | | | + language | | + version | + filename + CDN --- @@ -39,9 +76,7 @@ What can/can't be cached? - ServeRobotsTXT: can be cached, we don't serve a custom robots.txt to any user if the default version is private. - This view is already cached at the application level. - ServeSitemapXML: can be cached. It displays only public versions, for everyone. - This view is already cached at the application level. - ServeStaticFiles: can be cached, all files are the same for all projects and users. - Embed API: can be cached for public versions. - Search: @@ -54,3 +89,4 @@ What can/can't be cached? - If the project doesn't have subprojects. - All subprojects are public. - Analytics API: can't be cached, we want to always hit our serves with this one. +- Health check view: shouldn't be cached, we always want to hit our serves with this one. diff --git a/readthedocs/proxito/tests/test_full.py b/readthedocs/proxito/tests/test_full.py index 6a697a5f182..509dd388681 100644 --- a/readthedocs/proxito/tests/test_full.py +++ b/readthedocs/proxito/tests/test_full.py @@ -1479,13 +1479,13 @@ def test_cache_on_private_versions_custom_domain(self): self.domain.save() self._test_cache_control_header_project(expected_value='private', host=self.domain.domain) - # HTTPS redirect can always be cached. + # HTTPS redirects can always be cached. resp = self.client.get( "/en/latest/", secure=False, HTTP_HOST=self.domain.domain ) self.assertEqual(resp["Location"], f"https://{self.domain.domain}/en/latest/") self.assertEqual(resp.headers["CDN-Cache-Control"], "public") - self.assertEqual(resp.headers["Cache-Tag"], "project,project:latest") + self.assertEqual(resp.headers["Cache-Tag"], "project") def test_cache_public_versions(self): self.project.versions.update(privacy_level=PUBLIC) @@ -1513,7 +1513,7 @@ def test_cache_on_private_versions_custom_domain_subproject(self): self.domain.save() self._test_cache_control_header_subproject(expected_value='private', host=self.domain.domain) - # HTTPS redirect can always be cached. + # HTTPS redirects can always be cached. resp = self.client.get( '/projects/subproject/en/latest/', secure=False, @@ -1524,7 +1524,7 @@ def test_cache_on_private_versions_custom_domain_subproject(self): f"https://{self.domain.domain}/projects/subproject/en/latest/", ) self.assertEqual(resp.headers["CDN-Cache-Control"], "public") - self.assertEqual(resp.headers["Cache-Tag"], "subproject,subproject:latest") + self.assertEqual(resp.headers["Cache-Tag"], "project") def test_cache_public_versions_subproject(self): self.subproject.versions.update(privacy_level=PUBLIC) @@ -1536,15 +1536,18 @@ def test_cache_public_versions_custom_domain(self): self.domain.save() self._test_cache_control_header_subproject(expected_value='public', host=self.domain.domain) - # HTTPS redirect respects the privacy level of the version. + # HTTPS redirects can always be cached. resp = self.client.get( '/projects/subproject/en/latest/', secure=False, HTTP_HOST=self.domain.domain, ) - self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/') - self.assertEqual(resp.headers['CDN-Cache-Control'], 'public') - self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest') + self.assertEqual( + resp["Location"], + f"https://{self.domain.domain}/projects/subproject/en/latest/", + ) + self.assertEqual(resp.headers["CDN-Cache-Control"], "public") + self.assertEqual(resp.headers["Cache-Tag"], "project") def test_cache_disable_on_rtd_header_resolved_project(self): get( diff --git a/readthedocs/proxito/tests/test_middleware.py b/readthedocs/proxito/tests/test_middleware.py index c4408e94ec8..3b92268b158 100644 --- a/readthedocs/proxito/tests/test_middleware.py +++ b/readthedocs/proxito/tests/test_middleware.py @@ -116,8 +116,12 @@ def test_subproject_redirect(self): ) resp = self.client.get(self.url, HTTP_HOST="subproject.dev.readthedocs.io") self.assertEqual(resp.status_code, 302) - self.assertEqual(resp["location"], f"http://pip.dev.readthedocs.io/") - self.assertEqual(resp["X-RTD-Redirect"], RedirectType.to_canonical_domain.name) + self.assertEqual( + resp["location"], f"http://pip.dev.readthedocs.io/projects/subproject/" + ) + self.assertEqual( + resp["X-RTD-Redirect"], RedirectType.subproject_to_main_domain.name + ) # We are not canonicalizing custom domains -> public domain for now @pytest.mark.xfail(strict=True) diff --git a/readthedocs/proxito/tests/test_redirects.py b/readthedocs/proxito/tests/test_redirects.py index b5ce223c1bf..1a1c5c5236d 100644 --- a/readthedocs/proxito/tests/test_redirects.py +++ b/readthedocs/proxito/tests/test_redirects.py @@ -83,7 +83,17 @@ def test_subproject_redirect(self): r = self.client.get('/', HTTP_HOST='subproject.dev.readthedocs.io') self.assertEqual(r.status_code, 302) self.assertEqual( - r['Location'], 'https://project.dev.readthedocs.io/projects/subproject/en/latest/', + r["Location"], + "https://project.dev.readthedocs.io/projects/subproject/", + ) + + r = self.client.get( + "/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io" + ) + self.assertEqual(r.status_code, 302) + self.assertEqual( + r["Location"], + "https://project.dev.readthedocs.io/projects/subproject/en/latest/", ) r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io') diff --git a/readthedocs/proxito/views/mixins.py b/readthedocs/proxito/views/mixins.py index 4b8953d734a..812f000ce4f 100644 --- a/readthedocs/proxito/views/mixins.py +++ b/readthedocs/proxito/views/mixins.py @@ -19,7 +19,8 @@ from readthedocs.analytics.utils import get_client_ip from readthedocs.audit.models import AuditLog from readthedocs.builds.constants import EXTERNAL, INTERNAL -from readthedocs.core.resolver import resolve +from readthedocs.core.resolver import resolve, resolver +from readthedocs.core.utils.url import unsafe_join_url_path from readthedocs.projects.constants import MEDIA_TYPE_HTML from readthedocs.proxito.constants import RedirectType from readthedocs.redirects.exceptions import InfiniteRedirectException @@ -339,10 +340,8 @@ def canonical_redirect( self, request, final_project, - version_slug, - filename, redirect_type, - is_external_version=False, + external_version_slug=None, ): """ Return a redirect to the canonical domain including scheme. @@ -360,34 +359,34 @@ def canonical_redirect( :param request: Request object. :param final_project: The current project being served. - :param version_slug: The current version slug being served. - :param filename: The filename being served. :param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain) - :param external: If the version is from a pull request preview. + :param external_version_slug: The version slug if the request is from a pull request preview. """ from_url = request.build_absolute_uri() parsed_from = urlparse(from_url) if redirect_type == RedirectType.http_to_https: + # We only need to change the protocol. to = parsed_from._replace(scheme="https").geturl() - elif redirect_type in [ - RedirectType.to_canonical_domain, - RedirectType.subproject_to_main_domain, - ]: - to = resolve( + elif redirect_type == RedirectType.to_canonical_domain: + # We need to change the domain and protocol. + canonical_domain = final_project.get_canonical_custom_domain() + protocol = "https" if canonical_domain.https else "http" + to = parsed_from._replace( + scheme=protocol, netloc=canonical_domain.domain + ).geturl() + elif redirect_type == RedirectType.subproject_to_main_domain: + # We need to get the subproject root in the domain of the main + # project, and append the current path. + project_doc_prefix = resolver.get_url_prefix( project=final_project, - version_slug=version_slug, - filename=filename, - query_params=parsed_from.query, - external=is_external_version, + external_version_slug=external_version_slug, ) - # When a canonical redirect is done, only change the domain. - if redirect_type == RedirectType.to_canonical_domain: - parsed_to = urlparse(to) - to = parsed_from._replace( - scheme=parsed_to.scheme, - netloc=parsed_to.netloc, - ).geturl() + parsed_doc_prefix = urlparse(project_doc_prefix) + to = parsed_doc_prefix._replace( + path=unsafe_join_url_path(parsed_doc_prefix.path, parsed_from.path), + query=parsed_from.query, + ).geturl() else: raise NotImplementedError @@ -399,7 +398,9 @@ def canonical_redirect( ) raise InfiniteRedirectException() - log.info('Canonical Redirect.', host=request.get_host(), from_url=filename, to_url=to) + log.info( + "Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to + ) # All canonical redirects can be cached, since the final URL will check for authz. self.cache_response = True resp = HttpResponseRedirect(to) diff --git a/readthedocs/proxito/views/serve.py b/readthedocs/proxito/views/serve.py index 5e3e5bf7bde..7ef1d809a22 100644 --- a/readthedocs/proxito/views/serve.py +++ b/readthedocs/proxito/views/serve.py @@ -83,6 +83,29 @@ def get(self, so that we can decide if we need to serve docs or add a /. """ unresolved_domain = request.unresolved_domain + # Handle requests that need canonicalizing first, + # e.g. HTTP -> HTTPS, redirect to canonical domain, etc. + # We run this here to reduce work we need to do on easily cached responses. + # It's slower for the end user to have multiple HTTP round trips, + # but reduces chances for URL resolving bugs, + # and makes caching more effective because we don't care about authz. + redirect_type = self._get_canonical_redirect_type(request) + if redirect_type: + # TODO: find a better way to pass this to the middleware. + request.path_project_slug = unresolved_domain.project.slug + try: + return self.canonical_redirect( + request=request, + final_project=unresolved_domain.project, + external_version_slug=unresolved_domain.external_version_slug, + redirect_type=redirect_type, + ) + except InfiniteRedirectException: + # ``canonical_redirect`` raises this when it's redirecting back to itself. + # We can safely ignore it here because it's logged in ``canonical_redirect``, + # and we don't want to issue infinite redirects. + pass + original_version_slug = version_slug version_slug = self.get_version_from_host(request, version_slug) final_project, lang_slug, version_slug, filename = _get_project_data_from_request( # noqa @@ -145,22 +168,6 @@ def get(self, self.cache_response = True return spam_response - # Handle requests that need canonicalizing (eg. HTTP -> HTTPS, redirect to canonical domain) - redirect_type = self._get_canonical_redirect_type(request) - if redirect_type: - try: - return self.canonical_redirect( - request=request, - final_project=final_project, - version_slug=version_slug, - filename=filename, - redirect_type=redirect_type, - is_external_version=is_external, - ) - except InfiniteRedirectException: - # Don't redirect in this case, since it would break things - pass - # Handle a / redirect when we aren't a single version if all([ lang_slug is None, @@ -248,6 +255,16 @@ def _get_canonical_redirect_type(self, request): log.debug("Proxito CNAME HTTPS Redirect.", domain=domain.domain) return RedirectType.http_to_https + # Check for subprojects before checking for canonical domains, + # so we can redirect to the main domain first. + # Custom domains on subprojects are not supported. + if project.is_subproject: + log.debug( + "Proxito Public Domain -> Subproject Main Domain Redirect.", + project_slug=project.slug, + ) + return RedirectType.subproject_to_main_domain + if unresolved_domain.is_from_public_domain: canonical_domain = ( Domain.objects.filter(project=project) @@ -261,13 +278,6 @@ def _get_canonical_redirect_type(self, request): ) return RedirectType.to_canonical_domain - if project.is_subproject: - log.debug( - "Proxito Public Domain -> Subproject Main Domain Redirect.", - project_slug=project.slug, - ) - return RedirectType.subproject_to_main_domain - return None