Skip to content

Commit 033aa6c

Browse files
authored
Proxito: refactor canonical redirects (#10069)
Implementing the plan from #10065. - I'm caching all the redirects by default now, since the final URL will be checked for authz - Cache tags now are given by the project from the unresolved domain, since we are caching the response on that domain! Previous behavior wasn't correct. Closes #10065
1 parent 8ba7a05 commit 033aa6c

File tree

9 files changed

+214
-65
lines changed

9 files changed

+214
-65
lines changed

readthedocs/core/resolver.py

+51-1
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""URL resolver for documentation."""
2-
32
from urllib.parse import urlunparse
43

54
import structlog
65
from django.conf import settings
76

87
from readthedocs.builds.constants import EXTERNAL
98
from readthedocs.core.utils.extend import SettingsOverrideObject
9+
from readthedocs.core.utils.url import unsafe_join_url_path
1010

1111
log = structlog.get_logger(__name__)
1212

@@ -214,6 +214,56 @@ def resolve(
214214
)
215215
return urlunparse((protocol, domain, path, '', query_params, ''))
216216

217+
def _get_path_prefix(self, project):
218+
"""
219+
Returns the path prefix for a project.
220+
221+
If the project is a subproject, it will return ``/projects/<subproject-alias>/``.
222+
If the project is a main project, it will return ``/``.
223+
This will respect the custom urlconf of the project if it's defined.
224+
"""
225+
custom_prefix = project.custom_path_prefix
226+
parent_relationship = project.get_parent_relationship()
227+
if parent_relationship:
228+
prefix = custom_prefix or "projects"
229+
return unsafe_join_url_path(prefix, parent_relationship.alias, "/")
230+
231+
prefix = custom_prefix or "/"
232+
return unsafe_join_url_path(prefix, "/")
233+
234+
def get_url_prefix(self, project, external_version_slug=None):
235+
"""
236+
Get the URL prefix from where the documentation of ``project`` is served from.
237+
238+
This doesn't include the version or language. For example:
239+
240+
- https://docs.example.com/projects/<project-slug>/
241+
- https://docs.readthedocs.io/
242+
243+
This will respect the custom urlconf of the project if it's defined.
244+
245+
:param project: Project object to get the root URL from
246+
:param external_version_slug: If given, resolve using the external version domain.
247+
"""
248+
canonical_project = self._get_canonical_project(project)
249+
use_custom_domain = self._use_cname(canonical_project)
250+
custom_domain = canonical_project.get_canonical_custom_domain()
251+
if external_version_slug:
252+
domain = self._get_external_subdomain(
253+
canonical_project, external_version_slug
254+
)
255+
use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
256+
elif use_custom_domain and custom_domain:
257+
domain = custom_domain.domain
258+
use_https = custom_domain.https
259+
else:
260+
domain = self._get_project_subdomain(canonical_project)
261+
use_https = settings.PUBLIC_DOMAIN_USES_HTTPS
262+
263+
protocol = "https" if use_https else "http"
264+
path = self._get_path_prefix(project)
265+
return urlunparse((protocol, domain, path, "", "", ""))
266+
217267
def _get_canonical_project_data(self, project):
218268
"""
219269
Returns a tuple with (project, subproject_slug) from the canonical project of `project`.

readthedocs/core/utils/url.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""URL hadling utilities."""
2+
3+
4+
def unsafe_join_url_path(base, *args):
5+
"""
6+
Joins a base URL path with one or more path components.
7+
8+
This does a simple join of the base path with the path components,
9+
inserting a slash between each component.
10+
The resulting path will always start with a slash.
11+
12+
.. warning::
13+
14+
This does not offer protection against directory traversal attacks,
15+
it simply joins the path components together. This shouldn't be used
16+
to serve files, use ``readthedocs.storage.utils.safe_join`` for that.
17+
"""
18+
base = "/" + base.lstrip("/")
19+
for path in args:
20+
base = base.rstrip("/") + "/" + path.lstrip("/")
21+
return base

readthedocs/projects/models.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from readthedocs.core.history import ExtraHistoricalRecords
3232
from readthedocs.core.resolver import resolve, resolve_domain
3333
from readthedocs.core.utils import slugify
34+
from readthedocs.core.utils.url import unsafe_join_url_path
3435
from readthedocs.domains.querysets import DomainQueryset
3536
from readthedocs.projects import constants
3637
from readthedocs.projects.exceptions import ProjectConfigurationError
@@ -645,8 +646,8 @@ def proxied_api_host(self):
645646
if self.urlconf:
646647
# Add our proxied api host at the first place we have a $variable
647648
# This supports both subpaths & normal root hosting
648-
url_prefix = self.urlconf.split('$', 1)[0]
649-
return '/' + url_prefix.strip('/') + '/_'
649+
path_prefix = self.custom_path_prefix
650+
return unsafe_join_url_path(path_prefix, "/_")
650651
return '/_'
651652

652653
@property
@@ -663,6 +664,19 @@ def proxied_static_path(self):
663664
"""Path for static files hosted on the user's doc domain."""
664665
return f"{self.proxied_api_host}/static/"
665666

667+
@property
668+
def custom_path_prefix(self):
669+
"""
670+
Get the path prefix from the custom urlconf.
671+
672+
Returns `None` if the project doesn't have a custom urlconf.
673+
"""
674+
if self.urlconf:
675+
# Return the value before the first defined variable,
676+
# as that is a prefix and not part of our normal doc patterns.
677+
return self.urlconf.split("$", 1)[0]
678+
return None
679+
666680
@property
667681
def regex_urlconf(self):
668682
"""
@@ -768,12 +782,12 @@ class ProxitoURLConf:
768782

769783
return ProxitoURLConf
770784

771-
@property
785+
@cached_property
772786
def is_subproject(self):
773787
"""Return whether or not this project is a subproject."""
774788
return self.superprojects.exists()
775789

776-
@property
790+
@cached_property
777791
def superproject(self):
778792
relationship = self.get_parent_relationship()
779793
if relationship:

readthedocs/proxito/README.rst

+38-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,43 @@ Module in charge of serving documentation pages.
55

66
Read the Docs core team members can view the `Proxito design doc <https://github.com/readthedocs/el-proxito/blob/master/docs/design/architecture.rst>`_
77

8+
URL parts
9+
---------
10+
11+
In our code we use the following terms to refer to the different parts of the URL:
12+
13+
url:
14+
The full URL including the protocol, for example ``https://docs.readthedocs.io/en/latest/api/index.html``.
15+
path:
16+
The whole path from the URL without query arguments or fragment,
17+
for example ``/en/latest/api/index.html``.
18+
domain:
19+
The domain/subdomain without the protocol, for example ``docs.readthedocs.io``.
20+
language:
21+
The language of the documentation, for example ``en``.
22+
version:
23+
The version of the documentation, for example ``latest``.
24+
filename:
25+
The name of the file being served, for example ``/api/index.html``.
26+
path prefix:
27+
The path prefix of the URL without version or language,
28+
for a normal project this is ``/``, and for subprojects this is ``/projects/<subproject-alias>/``.
29+
This prefix can be different for project defining their own urlconf.
30+
31+
.. code:: text
32+
33+
URL
34+
|----------------------------------------------------|
35+
path
36+
|-------------------------|
37+
https://docs.readthedocs.io/en/latest/api/index.html
38+
|-------|-------------------|--|------|--------------|
39+
protocol | | | |
40+
domain | | |
41+
language | |
42+
version |
43+
filename
44+
845
CDN
946
---
1047

@@ -39,9 +76,7 @@ What can/can't be cached?
3976

4077
- ServeRobotsTXT: can be cached, we don't serve a custom robots.txt
4178
to any user if the default version is private.
42-
This view is already cached at the application level.
4379
- ServeSitemapXML: can be cached. It displays only public versions, for everyone.
44-
This view is already cached at the application level.
4580
- ServeStaticFiles: can be cached, all files are the same for all projects and users.
4681
- Embed API: can be cached for public versions.
4782
- Search:
@@ -54,3 +89,4 @@ What can/can't be cached?
5489
- If the project doesn't have subprojects.
5590
- All subprojects are public.
5691
- Analytics API: can't be cached, we want to always hit our serves with this one.
92+
- Health check view: shouldn't be cached, we always want to hit our serves with this one.

readthedocs/proxito/tests/test_full.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -1479,13 +1479,13 @@ def test_cache_on_private_versions_custom_domain(self):
14791479
self.domain.save()
14801480
self._test_cache_control_header_project(expected_value='private', host=self.domain.domain)
14811481

1482-
# HTTPS redirect can always be cached.
1482+
# HTTPS redirects can always be cached.
14831483
resp = self.client.get(
14841484
"/en/latest/", secure=False, HTTP_HOST=self.domain.domain
14851485
)
14861486
self.assertEqual(resp["Location"], f"https://{self.domain.domain}/en/latest/")
14871487
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
1488-
self.assertEqual(resp.headers["Cache-Tag"], "project,project:latest")
1488+
self.assertEqual(resp.headers["Cache-Tag"], "project")
14891489

14901490
def test_cache_public_versions(self):
14911491
self.project.versions.update(privacy_level=PUBLIC)
@@ -1513,7 +1513,7 @@ def test_cache_on_private_versions_custom_domain_subproject(self):
15131513
self.domain.save()
15141514
self._test_cache_control_header_subproject(expected_value='private', host=self.domain.domain)
15151515

1516-
# HTTPS redirect can always be cached.
1516+
# HTTPS redirects can always be cached.
15171517
resp = self.client.get(
15181518
'/projects/subproject/en/latest/',
15191519
secure=False,
@@ -1524,7 +1524,7 @@ def test_cache_on_private_versions_custom_domain_subproject(self):
15241524
f"https://{self.domain.domain}/projects/subproject/en/latest/",
15251525
)
15261526
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
1527-
self.assertEqual(resp.headers["Cache-Tag"], "subproject,subproject:latest")
1527+
self.assertEqual(resp.headers["Cache-Tag"], "project")
15281528

15291529
def test_cache_public_versions_subproject(self):
15301530
self.subproject.versions.update(privacy_level=PUBLIC)
@@ -1536,15 +1536,18 @@ def test_cache_public_versions_custom_domain(self):
15361536
self.domain.save()
15371537
self._test_cache_control_header_subproject(expected_value='public', host=self.domain.domain)
15381538

1539-
# HTTPS redirect respects the privacy level of the version.
1539+
# HTTPS redirects can always be cached.
15401540
resp = self.client.get(
15411541
'/projects/subproject/en/latest/',
15421542
secure=False,
15431543
HTTP_HOST=self.domain.domain,
15441544
)
1545-
self.assertEqual(resp['Location'], f'https://{self.domain.domain}/projects/subproject/en/latest/')
1546-
self.assertEqual(resp.headers['CDN-Cache-Control'], 'public')
1547-
self.assertEqual(resp.headers['Cache-Tag'], 'subproject,subproject:latest')
1545+
self.assertEqual(
1546+
resp["Location"],
1547+
f"https://{self.domain.domain}/projects/subproject/en/latest/",
1548+
)
1549+
self.assertEqual(resp.headers["CDN-Cache-Control"], "public")
1550+
self.assertEqual(resp.headers["Cache-Tag"], "project")
15481551

15491552
def test_cache_disable_on_rtd_header_resolved_project(self):
15501553
get(

readthedocs/proxito/tests/test_middleware.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,12 @@ def test_subproject_redirect(self):
116116
)
117117
resp = self.client.get(self.url, HTTP_HOST="subproject.dev.readthedocs.io")
118118
self.assertEqual(resp.status_code, 302)
119-
self.assertEqual(resp["location"], f"http://pip.dev.readthedocs.io/")
120-
self.assertEqual(resp["X-RTD-Redirect"], RedirectType.to_canonical_domain.name)
119+
self.assertEqual(
120+
resp["location"], f"http://pip.dev.readthedocs.io/projects/subproject/"
121+
)
122+
self.assertEqual(
123+
resp["X-RTD-Redirect"], RedirectType.subproject_to_main_domain.name
124+
)
121125

122126
# We are not canonicalizing custom domains -> public domain for now
123127
@pytest.mark.xfail(strict=True)

readthedocs/proxito/tests/test_redirects.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,17 @@ def test_subproject_redirect(self):
8383
r = self.client.get('/', HTTP_HOST='subproject.dev.readthedocs.io')
8484
self.assertEqual(r.status_code, 302)
8585
self.assertEqual(
86-
r['Location'], 'https://project.dev.readthedocs.io/projects/subproject/en/latest/',
86+
r["Location"],
87+
"https://project.dev.readthedocs.io/projects/subproject/",
88+
)
89+
90+
r = self.client.get(
91+
"/projects/subproject/", HTTP_HOST="project.dev.readthedocs.io"
92+
)
93+
self.assertEqual(r.status_code, 302)
94+
self.assertEqual(
95+
r["Location"],
96+
"https://project.dev.readthedocs.io/projects/subproject/en/latest/",
8797
)
8898

8999
r = self.client.get('/en/latest/', HTTP_HOST='subproject.dev.readthedocs.io')

readthedocs/proxito/views/mixins.py

+25-24
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
from readthedocs.analytics.utils import get_client_ip
2020
from readthedocs.audit.models import AuditLog
2121
from readthedocs.builds.constants import EXTERNAL, INTERNAL
22-
from readthedocs.core.resolver import resolve
22+
from readthedocs.core.resolver import resolve, resolver
23+
from readthedocs.core.utils.url import unsafe_join_url_path
2324
from readthedocs.projects.constants import MEDIA_TYPE_HTML
2425
from readthedocs.proxito.constants import RedirectType
2526
from readthedocs.redirects.exceptions import InfiniteRedirectException
@@ -339,10 +340,8 @@ def canonical_redirect(
339340
self,
340341
request,
341342
final_project,
342-
version_slug,
343-
filename,
344343
redirect_type,
345-
is_external_version=False,
344+
external_version_slug=None,
346345
):
347346
"""
348347
Return a redirect to the canonical domain including scheme.
@@ -360,34 +359,34 @@ def canonical_redirect(
360359
361360
:param request: Request object.
362361
:param final_project: The current project being served.
363-
:param version_slug: The current version slug being served.
364-
:param filename: The filename being served.
365362
:param redirect_type: The type of canonical redirect (https, canonical-cname, subproject-main-domain)
366-
:param external: If the version is from a pull request preview.
363+
:param external_version_slug: The version slug if the request is from a pull request preview.
367364
"""
368365
from_url = request.build_absolute_uri()
369366
parsed_from = urlparse(from_url)
370367

371368
if redirect_type == RedirectType.http_to_https:
369+
# We only need to change the protocol.
372370
to = parsed_from._replace(scheme="https").geturl()
373-
elif redirect_type in [
374-
RedirectType.to_canonical_domain,
375-
RedirectType.subproject_to_main_domain,
376-
]:
377-
to = resolve(
371+
elif redirect_type == RedirectType.to_canonical_domain:
372+
# We need to change the domain and protocol.
373+
canonical_domain = final_project.get_canonical_custom_domain()
374+
protocol = "https" if canonical_domain.https else "http"
375+
to = parsed_from._replace(
376+
scheme=protocol, netloc=canonical_domain.domain
377+
).geturl()
378+
elif redirect_type == RedirectType.subproject_to_main_domain:
379+
# We need to get the subproject root in the domain of the main
380+
# project, and append the current path.
381+
project_doc_prefix = resolver.get_url_prefix(
378382
project=final_project,
379-
version_slug=version_slug,
380-
filename=filename,
381-
query_params=parsed_from.query,
382-
external=is_external_version,
383+
external_version_slug=external_version_slug,
383384
)
384-
# When a canonical redirect is done, only change the domain.
385-
if redirect_type == RedirectType.to_canonical_domain:
386-
parsed_to = urlparse(to)
387-
to = parsed_from._replace(
388-
scheme=parsed_to.scheme,
389-
netloc=parsed_to.netloc,
390-
).geturl()
385+
parsed_doc_prefix = urlparse(project_doc_prefix)
386+
to = parsed_doc_prefix._replace(
387+
path=unsafe_join_url_path(parsed_doc_prefix.path, parsed_from.path),
388+
query=parsed_from.query,
389+
).geturl()
391390
else:
392391
raise NotImplementedError
393392

@@ -399,7 +398,9 @@ def canonical_redirect(
399398
)
400399
raise InfiniteRedirectException()
401400

402-
log.info('Canonical Redirect.', host=request.get_host(), from_url=filename, to_url=to)
401+
log.info(
402+
"Canonical Redirect.", host=request.get_host(), from_url=from_url, to_url=to
403+
)
403404
# All canonical redirects can be cached, since the final URL will check for authz.
404405
self.cache_response = True
405406
resp = HttpResponseRedirect(to)

0 commit comments

Comments
 (0)