Skip to content

Proxito: refactor canonical redirects #10069

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions readthedocs/core/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,28 +214,26 @@ def resolve(
)
return urlunparse((protocol, domain, path, '', query_params, ''))

def _get_root_path(self, project):
def _get_path_prefix(self, project):
"""
Returns the root path for a project.
Returns the path prefix for a project.

If the project is a subproject, it will return ``/projects/<project-slug>/``.
If the project is a subproject, it will return ``/projects/<subproject-alias>/``.
If the project is a main project, it will return ``/``.
This will respect the custom urlconf of the project if it's defined.
"""
custom_prefix = None
if project.urlconf:
custom_prefix = project.urlconf.split("$", 1)[0]

if project.is_subproject:
custom_prefix = project.custom_path_prefix
parent_relationship = project.get_parent_relationship()
if parent_relationship:
prefix = custom_prefix or "projects"
return join_url_path(prefix, project.slug, "/")
return join_url_path(prefix, parent_relationship.alias, "/")

prefix = custom_prefix or "/"
return join_url_path(prefix, "/")

def resolve_root(self, project, external_version_slug=None):
def get_url_prefix(self, project, external_version_slug=None):
"""
Get the root URL from where the documentation of ``project`` is served from.
Get the URL prefix from where the documentation of ``project`` is served from.

This doesn't include the version or language. For example:

Expand Down Expand Up @@ -263,7 +261,7 @@ def resolve_root(self, project, external_version_slug=None):
use_https = settings.PUBLIC_DOMAIN_USES_HTTPS

protocol = "https" if use_https else "http"
path = self._get_root_path(project)
path = self._get_path_prefix(project)
return urlunparse((protocol, domain, path, "", "", ""))

def _get_canonical_project_data(self, project):
Expand Down
8 changes: 7 additions & 1 deletion readthedocs/core/utils/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@ def join_url_path(base, *args):

This does a simple join of the base path with the path components,
inserting a slash between each component.
It does not offer protection against directory traversal attacks.
The resulting path will always start with a slash.

.. warning::

This does not offer protection against directory traversal attacks,
it simply joins the path components together. This shouldn't be used
to serve files, use ``readthedocs.storage.utils.safe_join`` for that.
"""
base = "/" + base.lstrip("/")
for path in args:
Expand Down
18 changes: 15 additions & 3 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from readthedocs.core.history import ExtraHistoricalRecords
from readthedocs.core.resolver import resolve, resolve_domain
from readthedocs.core.utils import slugify
from readthedocs.core.utils.url import join_url_path
from readthedocs.domains.querysets import DomainQueryset
from readthedocs.projects import constants
from readthedocs.projects.exceptions import ProjectConfigurationError
Expand Down Expand Up @@ -630,8 +631,8 @@ def proxied_api_host(self):
if self.urlconf:
# Add our proxied api host at the first place we have a $variable
# This supports both subpaths & normal root hosting
url_prefix = self.urlconf.split('$', 1)[0]
return '/' + url_prefix.strip('/') + '/_'
path_prefix = self.custom_path_prefix
return join_url_path(path_prefix, "/_")
return '/_'

@property
Expand All @@ -648,6 +649,17 @@ def proxied_static_path(self):
"""Path for static files hosted on the user's doc domain."""
return f"{self.proxied_api_host}/static/"

@property
def custom_path_prefix(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel these name changes a lot easier to read and follow 👍🏼

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naming is hard :)

"""
Get the path prefix from the custom urlconf.

Returns `None` if the project doesn't have a custom urlconf.
"""
if self.urlconf:
return self.urlconf.split("$", 1)[0]
return None

@property
def regex_urlconf(self):
"""
Expand Down Expand Up @@ -758,7 +770,7 @@ def is_subproject(self):
"""Return whether or not this project is a subproject."""
return self.superprojects.exists()

@property
@cached_property
def superproject(self):
relationship = self.get_parent_relationship()
if relationship:
Expand Down
38 changes: 36 additions & 2 deletions readthedocs/proxito/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,41 @@ Module in charge of serving documentation pages.

Read the Docs core team members can view the `Proxito design doc <https://github.com/readthedocs/el-proxito/blob/master/docs/design/architecture.rst>`_

URL parts
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a really good start! 💯

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, really happy to see this.

---------

In our code we use the following terms to refer to the different parts of the URL:

url:
The full URL, for example ``https://docs.readthedocs.io/en/latest/index.html``.
path:
The whole path from the URL, for example ``/en/latest/index.html``.
domain:
The domain/subdomain without the protocol, for example ``docs.readthedocs.io``.
language:
The language of the documentation, for example ``en``.
version:
The version of the documentation, for example ``latest``.
filename:
The name of the file being served, for example ``index.html``.
path prefix:
The path prefix of the URL without version or language,
for a normal project this is ``/``, and for subprojects this is ``/projects/<subproject-alias>/``.

.. code:: text

URL
|------------------------------------------------|
path
|---------------------|
https://docs.readthedocs.io/en/latest/index.html
|-------|-------------------|--|------|----------|
protocol | | | |
domain | | |
language | |
version |
filename
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is great. Though this is varying a bit from urlparse in some ways which make sense, but we should be explicit if we don't care about the other parts (eg. query & fragment).

urlparse("scheme://netloc/path;parameters?query#fragment")

https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse


CDN
---

Expand Down Expand Up @@ -39,9 +74,7 @@ What can/can't be cached?

- ServeRobotsTXT: can be cached, we don't serve a custom robots.txt
to any user if the default version is private.
This view is already cached at the application level.
- ServeSitemapXML: can be cached. It displays only public versions, for everyone.
This view is already cached at the application level.
- ServeStaticFiles: can be cached, all files are the same for all projects and users.
- Embed API: can be cached for public versions.
- Search:
Expand All @@ -54,3 +87,4 @@ What can/can't be cached?
- If the project doesn't have subprojects.
- All subprojects are public.
- Analytics API: can't be cached, we want to always hit our serves with this one.
- Health check view: shouldn't be cached, we want to always hit our serves with this one.
9 changes: 9 additions & 0 deletions readthedocs/proxito/tests/test_redirects.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ def test_subproject_redirect(self):
"https://project.dev.readthedocs.io/projects/subproject/",
)

r = self.client.get(
"/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io"
)
self.assertEqual(r.status_code, 302)
self.assertEqual(
r["Location"],
"https://project.dev.readthedocs.io/projects/subproject/en/latest/",
)

r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io')
self.assertEqual(r.status_code, 302)
self.assertEqual(
Expand Down
8 changes: 4 additions & 4 deletions readthedocs/proxito/views/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,13 +370,13 @@ def canonical_redirect(
elif redirect_type == RedirectType.subproject_to_main_domain:
# We need to get the subproject root in the domain of the main
# project, and append the current path.
project_doc_root = resolver.resolve_root(
project_doc_prefix = resolver.get_url_prefix(
project=final_project,
external_version_slug=external_version_slug,
)
parsed_doc_root = urlparse(project_doc_root)
to = parsed_doc_root._replace(
path=join_url_path(parsed_doc_root.path, parsed_from.path),
parsed_doc_prefix = urlparse(project_doc_prefix)
to = parsed_doc_prefix._replace(
path=join_url_path(parsed_doc_prefix.path, parsed_from.path),
query=parsed_from.query,
).geturl()
else:
Expand Down
12 changes: 9 additions & 3 deletions readthedocs/proxito/views/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,15 @@ def get(self,
so that we can decide if we need to serve docs or add a /.
"""
unresolved_domain = request.unresolved_domain
# Handle requests that need canonicalizing,
# Handle requests that need canonicalizing first,
# e.g. HTTP -> HTTPS, redirect to canonical domain, etc.
# We run this here to reduce work we need to do on easily cached responses.
# It's slower for the end user to have multiple HTTP round trips,
# but reduces chances for URL resolving bugs,
# and makes caching more effective because we don't care about authz.
redirect_type = self._get_canonical_redirect_type(request)
if redirect_type:
# Attach the current project to the request.
# TODO: find a better way to pass this to the middleware.
request.path_project_slug = unresolved_domain.project.slug
try:
return self.canonical_redirect(
Expand All @@ -104,7 +108,9 @@ def get(self,
redirect_type=redirect_type,
)
except InfiniteRedirectException:
# Don't redirect in this case, since it would break things.
# ``canonical_redirect`` raises this when it's redirecting back to itself.
# We can safely ignore it here because it's logged in ``canonical_redirect``,
# and we don't want to issue infinite redirects.
pass

version_slug = self.get_version_from_host(request, version_slug)
Expand Down