Skip to content

Proxito: refactor canonical redirects #10069

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 1, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion readthedocs/core/resolver.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""URL resolver for documentation."""

from urllib.parse import urlunparse

import structlog
from django.conf import settings

from readthedocs.builds.constants import EXTERNAL
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.core.utils.url import join_url_path

log = structlog.get_logger(__name__)

Expand Down Expand Up @@ -214,6 +214,56 @@ def resolve(
)
return urlunparse((protocol, domain, path, '', query_params, ''))

def _get_path_prefix(self, project):
"""
Returns the path prefix for a project.

If the project is a subproject, it will return ``/projects/<subproject-alias>/``.
If the project is a main project, it will return ``/``.
This will respect the custom urlconf of the project if it's defined.
"""
custom_prefix = project.custom_path_prefix
parent_relationship = project.get_parent_relationship()
if parent_relationship:
prefix = custom_prefix or "projects"
return join_url_path(prefix, parent_relationship.alias, "/")

prefix = custom_prefix or "/"
return join_url_path(prefix, "/")

def get_url_prefix(self, project, external_version_slug=None):
"""
Get the URL prefix from where the documentation of ``project`` is served from.

This doesn't include the version or language. For example:

- https://docs.example.com/projects/<project-slug>/
- https://docs.readthedocs.io/

This will respect the custom urlconf of the project if it's defined.

:param project: Project object to get the root URL from
:param external_version_slug: If given, resolve using the external version domain.
"""
canonical_project = self._get_canonical_project(project)
use_custom_domain = self._use_cname(canonical_project)
custom_domain = canonical_project.get_canonical_custom_domain()
Comment on lines +249 to +250
Copy link
Member

@ericholscher ericholscher Feb 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems odd to have both of these... should custom_domain not take this into account? Should we pass use_custom_domain into.get_canonical_custom_domain()

Where is the canonical code that's responsible for deciding if a domain is used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We override the _use_cname method on .com to check for the plan.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about overriding .get_canonical_custom_domain() instead? Would that be doable and simpler?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we unify feature check, we won't be needing overrides.

if external_version_slug:
domain = self._get_external_subdomain(
canonical_project, external_version_slug
)
use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
elif use_custom_domain and custom_domain:
domain = custom_domain.domain
use_https = custom_domain.https
else:
domain = self._get_project_subdomain(canonical_project)
use_https = settings.PUBLIC_DOMAIN_USES_HTTPS

protocol = "https" if use_https else "http"
path = self._get_path_prefix(project)
return urlunparse((protocol, domain, path, "", "", ""))

def _get_canonical_project_data(self, project):
"""
Returns a tuple with (project, subproject_slug) from the canonical project of `project`.
Expand Down
21 changes: 21 additions & 0 deletions readthedocs/core/utils/url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""URL hadling utilities."""


def join_url_path(base, *args):
"""
Joins a base URL path with one or more path components.

This does a simple join of the base path with the path components,
inserting a slash between each component.
The resulting path will always start with a slash.

.. warning::

This does not offer protection against directory traversal attacks,
it simply joins the path components together. This shouldn't be used
to serve files, use ``readthedocs.storage.utils.safe_join`` for that.
"""
base = "/" + base.lstrip("/")
for path in args:
base = base.rstrip("/") + "/" + path.lstrip("/")
return base
20 changes: 16 additions & 4 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from readthedocs.core.history import ExtraHistoricalRecords
from readthedocs.core.resolver import resolve, resolve_domain
from readthedocs.core.utils import slugify
from readthedocs.core.utils.url import join_url_path
from readthedocs.domains.querysets import DomainQueryset
from readthedocs.projects import constants
from readthedocs.projects.exceptions import ProjectConfigurationError
Expand Down Expand Up @@ -641,8 +642,8 @@ def proxied_api_host(self):
if self.urlconf:
# Add our proxied api host at the first place we have a $variable
# This supports both subpaths & normal root hosting
url_prefix = self.urlconf.split('$', 1)[0]
return '/' + url_prefix.strip('/') + '/_'
path_prefix = self.custom_path_prefix
return join_url_path(path_prefix, "/_")
return '/_'

@property
Expand All @@ -659,6 +660,17 @@ def proxied_static_path(self):
"""Path for static files hosted on the user's doc domain."""
return f"{self.proxied_api_host}/static/"

@property
def custom_path_prefix(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel these name changes a lot easier to read and follow 👍🏼

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naming is hard :)

"""
Get the path prefix from the custom urlconf.

Returns `None` if the project doesn't have a custom urlconf.
"""
if self.urlconf:
return self.urlconf.split("$", 1)[0]
return None

@property
def regex_urlconf(self):
"""
Expand Down Expand Up @@ -764,12 +776,12 @@ class ProxitoURLConf:

return ProxitoURLConf

@property
@cached_property
def is_subproject(self):
"""Return whether or not this project is a subproject."""
return self.superprojects.exists()

@property
@cached_property
def superproject(self):
relationship = self.get_parent_relationship()
if relationship:
Expand Down
35 changes: 35 additions & 0 deletions readthedocs/proxito/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,41 @@ Module in charge of serving documentation pages.

Read the Docs core team members can view the `Proxito design doc <https://github.com/readthedocs/el-proxito/blob/master/docs/design/architecture.rst>`_

URL parts
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a really good start! 💯

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, really happy to see this.

---------

In our code we use the following terms to refer to the different parts of the URL:

url:
The full URL, for example ``https://docs.readthedocs.io/en/latest/index.html``.
path:
The whole path from the URL, for example ``/en/latest/index.html``.
domain:
The domain/subdomain without the protocol, for example ``docs.readthedocs.io``.
language:
The language of the documentation, for example ``en``.
version:
The version of the documentation, for example ``latest``.
filename:
The name of the file being served, for example ``index.html``.
path prefix:
The path prefix of the URL without version or language,
for a normal project this is ``/``, and for subprojects this is ``/projects/<subproject-alias>/``.

.. code:: text

URL
|------------------------------------------------|
path
|---------------------|
https://docs.readthedocs.io/en/latest/index.html
|-------|-------------------|--|------|----------|
protocol | | | |
domain | | |
language | |
version |
filename
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is great. Though this is varying a bit from urlparse in some ways which make sense, but we should be explicit if we don't care about the other parts (eg. query & fragment).

urlparse("scheme://netloc/path;parameters?query#fragment")

https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse


CDN
---

Expand Down
34 changes: 21 additions & 13 deletions readthedocs/proxito/tests/test_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -1460,11 +1460,13 @@ def test_cache_on_private_versions_custom_domain(self):
self.domain.save()
self._test_cache_control_header_project(expected_value='private', host=self.domain.domain)

# HTTPS redirect respects the privacy level of the version.
resp = self.client.get('/en/latest/', secure=False, HTTP_HOST=self.domain.domain)
self.assertEqual(resp['Location'], f'https://{self.domain.domain}/en/latest/')
self.assertEqual(resp.headers['CDN-Cache-Control'], 'private')
self.assertEqual(resp.headers['Cache-Tag'], 'project,project:latest')
# HTTPS redirects can always be cached.
resp = self.client.get(
"/en/latest/", secure=False, HTTP_HOST=self.domain.domain
)
self.assertEqual(resp["Location"], f"https://{self.domain.domain}/en/latest/")
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
self.assertEqual(resp.headers["Cache-Tag"], "project")

def test_cache_public_versions(self):
self.project.versions.update(privacy_level=PUBLIC)
Expand Down Expand Up @@ -1492,15 +1494,18 @@ def test_cache_on_private_versions_custom_domain_subproject(self):
self.domain.save()
self._test_cache_control_header_subproject(expected_value='private', host=self.domain.domain)

# HTTPS redirect respects the privacy level of the version.
# HTTPS redirects can always be cached.
resp = self.client.get(
'/projects/subproject/en/latest/',
secure=False,
HTTP_HOST=self.domain.domain,
)
self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/')
self.assertEqual(resp.headers['CDN-Cache-Control'], 'private')
self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest')
self.assertEqual(
resp["Location"],
f"https://{self.domain.domain}/projects/subproject/en/latest/",
)
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
self.assertEqual(resp.headers["Cache-Tag"], "project")

def test_cache_public_versions_subproject(self):
self.subproject.versions.update(privacy_level=PUBLIC)
Expand All @@ -1512,15 +1517,18 @@ def test_cache_public_versions_custom_domain(self):
self.domain.save()
self._test_cache_control_header_subproject(expected_value='public', host=self.domain.domain)

# HTTPS redirect respects the privacy level of the version.
# HTTPS redirects can always be cached.
resp = self.client.get(
'/projects/subproject/en/latest/',
secure=False,
HTTP_HOST=self.domain.domain,
)
self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/')
self.assertEqual(resp.headers['CDN-Cache-Control'], 'public')
self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest')
self.assertEqual(
resp["Location"],
f"https://{self.domain.domain}/projects/subproject/en/latest/",
)
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
self.assertEqual(resp.headers["Cache-Tag"], "project")

def test_cache_disable_on_rtd_header_resolved_project(self):
get(
Expand Down
8 changes: 6 additions & 2 deletions readthedocs/proxito/tests/test_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,12 @@ def test_subproject_redirect(self):
)
resp = self.client.get(self.url, HTTP_HOST="subproject.dev.readthedocs.io")
self.assertEqual(resp.status_code, 302)
self.assertEqual(resp["location"], f"http://pip.dev.readthedocs.io/")
self.assertEqual(resp["X-RTD-Redirect"], RedirectType.to_canonical_domain.name)
self.assertEqual(
resp["location"], f"http://pip.dev.readthedocs.io/projects/subproject/"
)
self.assertEqual(
resp["X-RTD-Redirect"], RedirectType.subproject_to_main_domain.name
)

# We are not canonicalizing custom domains -> public domain for now
@pytest.mark.xfail(strict=True)
Expand Down
12 changes: 11 additions & 1 deletion readthedocs/proxito/tests/test_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,17 @@ def test_subproject_redirect(self):
r = self.client.get('/', HTTP_HOST='subproject.dev.readthedocs.io')
self.assertEqual(r.status_code, 302)
self.assertEqual(
r['Location'], 'https://project.dev.readthedocs.io/projects/subproject/en/latest/',
r["Location"],
"https://project.dev.readthedocs.io/projects/subproject/",
)

r = self.client.get(
"/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io"
)
self.assertEqual(r.status_code, 302)
self.assertEqual(
r["Location"],
"https://project.dev.readthedocs.io/projects/subproject/en/latest/",
)

r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io')
Expand Down
51 changes: 27 additions & 24 deletions readthedocs/proxito/views/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from readthedocs.analytics.utils import get_client_ip
from readthedocs.audit.models import AuditLog
from readthedocs.builds.constants import EXTERNAL, INTERNAL
from readthedocs.core.resolver import resolve
from readthedocs.core.resolver import resolve, resolver
from readthedocs.core.utils.url import join_url_path
from readthedocs.projects.constants import MEDIA_TYPE_HTML
from readthedocs.proxito.constants import RedirectType
from readthedocs.redirects.exceptions import InfiniteRedirectException
Expand Down Expand Up @@ -337,10 +338,8 @@ def canonical_redirect(
self,
request,
final_project,
version_slug,
filename,
redirect_type,
is_external_version=False,
external_version_slug=None,
):
"""
Return a redirect to the canonical domain including scheme.
Expand All @@ -358,34 +357,34 @@ def canonical_redirect(

:param request: Request object.
:param final_project: The current project being served.
:param version_slug: The current version slug being served.
:param filename: The filename being served.
:param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
:param external: If the version is from a pull request preview.
:param external_version_slug: The version slug if the request is from a pull request preview.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to pass this in, or can we pull it off the request?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can get it from the unresolved_domain from the request, but it felt more explicit not having these methods depend on the request having all that.

"""
from_url = request.build_absolute_uri()
parsed_from = urlparse(from_url)

if redirect_type == RedirectType.http_to_https:
# We only need to change the protocol.
to = parsed_from._replace(scheme="https").geturl()
elif redirect_type in [
RedirectType.to_canonical_domain,
RedirectType.subproject_to_main_domain,
]:
to = resolve(
elif redirect_type == RedirectType.to_canonical_domain:
# We need to change the domain and protocol.
canonical_domain = final_project.get_canonical_custom_domain()
protocol = "https" if canonical_domain.https else "http"
to = parsed_from._replace(
scheme=protocol, netloc=canonical_domain.domain
).geturl()
elif redirect_type == RedirectType.subproject_to_main_domain:
# We need to get the subproject root in the domain of the main
# project, and append the current path.
project_doc_prefix = resolver.get_url_prefix(
project=final_project,
version_slug=version_slug,
filename=filename,
query_params=parsed_from.query,
external=is_external_version,
external_version_slug=external_version_slug,
)
# When a canonical redirect is done, only change the domain.
if redirect_type == RedirectType.to_canonical_domain:
parsed_to = urlparse(to)
to = parsed_from._replace(
scheme=parsed_to.scheme,
netloc=parsed_to.netloc,
).geturl()
parsed_doc_prefix = urlparse(project_doc_prefix)
to = parsed_doc_prefix._replace(
path=join_url_path(parsed_doc_prefix.path, parsed_from.path),
query=parsed_from.query,
).geturl()
else:
raise NotImplementedError

Expand All @@ -397,7 +396,11 @@ def canonical_redirect(
)
raise InfiniteRedirectException()

log.info('Canonical Redirect.', host=request.get_host(), from_url=filename, to_url=to)
log.info(
"Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
)
# Canocanical redirects can be cached, since the final URL will check for authz.
self.cache_request = True
resp = HttpResponseRedirect(to)
resp["X-RTD-Redirect"] = redirect_type.name
return resp
Expand Down
Loading