Skip to content

Commit 141f89d

Browse files
authored
Unresolver: strict validation for external versions and other fixes (#9534)
1 parent 34e76e4 commit 141f89d

File tree

4 files changed

+136
-41
lines changed

4 files changed

+136
-41
lines changed

readthedocs/core/unresolver.py

+93-32
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
from dataclasses import dataclass
3-
from urllib.parse import urlparse
3+
from urllib.parse import ParseResult, urlparse
44

55
import structlog
66
from django.conf import settings
@@ -27,8 +27,7 @@ class UnresolvedURL:
2727

2828
version: Version = None
2929
filename: str = None
30-
query: str = None
31-
fragment: str = None
30+
parsed_url: ParseResult = None
3231
domain: Domain = None
3332
external: bool = False
3433

@@ -41,59 +40,92 @@ class Unresolver:
4140
# - /en/latest/
4241
# - /en/latest/file/name/
4342
multiversion_pattern = re.compile(
44-
r"^/(?P<language>{lang_slug})(/((?P<version>{version_slug})(/(?P<file>{filename_slug}))?)?)?$".format( # noqa
43+
r"""
44+
^/(?P<language>{lang_slug}) # Must have the language slug.
45+
(/((?P<version>{version_slug})(/(?P<file>{filename_slug}))?)?)?$ # Optionally a version followed by a file. # noqa
46+
""".format(
4547
**pattern_opts
46-
)
48+
),
49+
re.VERBOSE,
4750
)
4851

4952
# This pattern matches:
5053
# - /projects/subproject
5154
# - /projects/subproject/
5255
# - /projects/subproject/file/name/
5356
subproject_pattern = re.compile(
54-
r"^/projects/(?P<project>{project_slug}+)(/(?P<file>{filename_slug}))?$".format(
57+
r"""
58+
^/projects/ # Must have the `projects` prefix.
59+
(?P<project>{project_slug}+) # Followed by the subproject alias.
60+
(/(?P<file>{filename_slug}))?$ # Optionally a filename, which will be recursively resolved.
61+
""".format(
5562
**pattern_opts
56-
)
63+
),
64+
re.VERBOSE,
5765
)
5866

59-
def unresolve(self, url, add_index=True):
67+
def unresolve(self, url, append_indexhtml=True):
6068
"""
6169
Turn a URL into the component parts that our views would use to process them.
6270
6371
This is useful for lots of places,
6472
like where we want to figure out exactly what file a URL maps to.
6573
6674
:param url: Full URL to unresolve (including the protocol and domain part).
67-
:param add_index: If `True` the filename will be normalized
75+
:param append_indexhtml: If `True` directories will be normalized
6876
to end with ``/index.html``.
6977
"""
7078
parsed = urlparse(url)
7179
domain = self.get_domain_from_host(parsed.netloc)
72-
project_slug, domain_object, external = self.unresolve_domain(domain)
73-
if not project_slug:
80+
(
81+
parent_project_slug,
82+
domain_object,
83+
external_version_slug,
84+
) = self.unresolve_domain(domain)
85+
if not parent_project_slug:
7486
return None
7587

76-
parent_project = Project.objects.filter(slug=project_slug).first()
88+
parent_project = Project.objects.filter(slug=parent_project_slug).first()
7789
if not parent_project:
7890
return None
7991

80-
project, version, filename = self._unresolve_path(
92+
current_project, version, filename = self._unresolve_path(
8193
parent_project=parent_project,
8294
path=parsed.path,
8395
)
8496

85-
if add_index and filename and filename.endswith("/"):
97+
# Make sure we are serving the external version from the subdomain.
98+
if external_version_slug and version:
99+
if external_version_slug != version.slug:
100+
log.warning(
101+
"Invalid version for external domain.",
102+
domain=domain,
103+
version_slug=version.slug,
104+
)
105+
version = None
106+
filename = None
107+
elif not version.is_external:
108+
log.warning(
109+
"Attempt of serving a non-external version from RTD_EXTERNAL_VERSION_DOMAIN.",
110+
domain=domain,
111+
version_slug=version.slug,
112+
version_type=version.type,
113+
url=url,
114+
)
115+
version = None
116+
filename = None
117+
118+
if append_indexhtml and filename and filename.endswith("/"):
86119
filename += "index.html"
87120

88121
return UnresolvedURL(
89122
parent_project=parent_project,
90-
project=project or parent_project,
123+
project=current_project or parent_project,
91124
version=version,
92125
filename=filename,
93-
query=parsed.query,
94-
fragment=parsed.fragment,
126+
parsed_url=parsed,
95127
domain=domain_object,
96-
external=external,
128+
external=bool(external_version_slug),
97129
)
98130

99131
@staticmethod
@@ -109,7 +141,11 @@ def _match_multiversion_project(self, parent_project, path):
109141
Try to match a multiversion project.
110142
111143
If the translation exists, we return a result even if the version doesn't,
112-
so the translation is taken as the canonical project (useful for 404 pages).
144+
so the translation is taken as the current project (useful for 404 pages).
145+
146+
:returns: None or a tuple with the current project, version and file.
147+
A tuple with only the project means we weren't able to find a version,
148+
but the translation was correct.
113149
"""
114150
match = self.multiversion_pattern.match(path)
115151
if not match:
@@ -138,24 +174,40 @@ def _match_subproject(self, parent_project, path):
138174
139175
If the subproject exists, we try to resolve the rest of the path
140176
with the subproject as the canonical project.
177+
178+
If the subproject exists, we return a result even if version doesn't,
179+
so the subproject is taken as the current project (useful for 404 pages).
180+
181+
:returns: None or a tuple with the current project, version and file.
182+
A tuple with only the project means we were able to find the subproject,
183+
but we weren't able to resolve the rest of the path.
141184
"""
142185
match = self.subproject_pattern.match(path)
143186
if not match:
144187
return None
145188

146-
project_slug = match.group("project")
189+
subproject_alias = match.group("project")
147190
file = self._normalize_filename(match.group("file"))
148191
project_relationship = (
149-
parent_project.subprojects.filter(alias=project_slug)
192+
parent_project.subprojects.filter(alias=subproject_alias)
150193
.prefetch_related("child")
151194
.first()
152195
)
153196
if project_relationship:
154-
return self._unresolve_path(
155-
parent_project=project_relationship.child,
197+
# We use the subproject as the new parent project
198+
# to resolve the rest of the path relative to it.
199+
subproject = project_relationship.child
200+
response = self._unresolve_path(
201+
parent_project=subproject,
156202
path=file,
157203
check_subprojects=False,
158204
)
205+
# If we got a valid response, return that,
206+
# otherwise return the current subproject
207+
# as the current project without a valid version or path.
208+
if response:
209+
return response
210+
return subproject, None, None
159211
return None
160212

161213
def _match_single_version_project(self, parent_project, path):
@@ -182,10 +234,19 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True):
182234
If the returned version is `None`, then we weren't able to
183235
unresolve the path into a valid version of the project.
184236
237+
The checks are done in the following order:
238+
239+
- Check for multiple versions if the parent project
240+
isn't a single version project.
241+
- Check for subprojects.
242+
- Check for single versions if the parent project isn’t
243+
a multi version project.
244+
185245
:param parent_project: The project that owns the path.
186246
:param path: The path to unresolve.
187247
:param check_subprojects: If we should check for subprojects,
188-
this is used to call this function recursively.
248+
this is used to call this function recursively when
249+
resolving the path from a subproject (we don't support subprojects of subprojects).
189250
190251
:returns: A tuple with: project, version, and file name.
191252
"""
@@ -216,7 +277,7 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True):
216277
if response:
217278
return response
218279

219-
return None, None, None
280+
return parent_project, None, None
220281

221282
@staticmethod
222283
def get_domain_from_host(host):
@@ -234,8 +295,8 @@ def unresolve_domain(self, domain):
234295
Unresolve domain by extracting relevant information from it.
235296
236297
:param str domain: Domain to extract the information from.
237-
:returns: A tuple with: the project slug, domain object, and if the domain
238-
is from an external version.
298+
:returns: A tuple with: the project slug, domain object, and the
299+
external version slug if the domain is from an external version.
239300
"""
240301
public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN)
241302
external_domain = self.get_domain_from_host(
@@ -250,22 +311,22 @@ def unresolve_domain(self, domain):
250311
if public_domain == root_domain:
251312
project_slug = subdomain
252313
log.debug("Public domain.", domain=domain)
253-
return project_slug, None, False
314+
return project_slug, None, None
254315

255316
# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com)
256317
# for example, but these might be phishing, so let's ignore them for now.
257318
log.warning("Weird variation of our domain.", domain=domain)
258-
return None, None, False
319+
return None, None, None
259320

260321
# Serve PR builds on external_domain host.
261322
if external_domain == root_domain:
262323
try:
324+
project_slug, version_slug = subdomain.rsplit("--", maxsplit=1)
263325
log.debug("External versions domain.", domain=domain)
264-
project_slug, _ = subdomain.rsplit("--", maxsplit=1)
265-
return project_slug, None, True
326+
return project_slug, None, version_slug
266327
except ValueError:
267328
log.info("Invalid format of external versions domain.", domain=domain)
268-
return None, None, False
329+
return None, None, None
269330

270331
# Custom domain.
271332
domain_object = (

readthedocs/embed/views.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def get(self, request):
8181
if url:
8282
unresolved = self.unresolved_url
8383
path = unresolved.filename
84-
section = unresolved.fragment
84+
section = unresolved.parsed_url.fragment
8585
elif not path and not doc:
8686
return Response(
8787
{

readthedocs/proxito/middleware.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
5353
log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug)
5454
return project_slug
5555

56-
project_slug, domain_object, external = unresolver.unresolve_domain(host)
56+
project_slug, domain_object, external_version_slug = unresolver.unresolve_domain(
57+
host
58+
)
5759
if not project_slug:
5860
# Block domains that look like ours, may be phishing.
5961
if external_domain in host or public_domain in host:
@@ -85,11 +87,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
8587
return project_slug
8688

8789
# Pull request previews.
88-
if external:
89-
subdomain, _ = host.split(".", maxsplit=1)
90-
_, version_slug = subdomain.rsplit("--", maxsplit=1)
90+
if external_version_slug:
9191
request.external_domain = True
92-
request.host_version_slug = version_slug
92+
request.host_version_slug = external_version_slug
9393
log.debug("Proxito External Version Domain.", host=host)
9494
return project_slug
9595

readthedocs/rtd_tests/tests/test_unresolver.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,13 @@ def test_unresolver(self):
2525
self.assertEqual(parts.project, self.pip)
2626
self.assertEqual(parts.version, self.version)
2727
self.assertEqual(parts.filename, "/foo.html")
28-
self.assertEqual(parts.fragment, "fragment")
29-
self.assertEqual(parts.query, "search=api")
28+
self.assertEqual(parts.parsed_url.fragment, "fragment")
29+
self.assertEqual(parts.parsed_url.query, "search=api")
3030

3131
def test_filename_wihtout_index(self):
32-
parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False)
32+
parts = unresolve(
33+
"https://pip.readthedocs.io/en/latest/file/", append_indexhtml=False
34+
)
3335
self.assertEqual(parts.parent_project, self.pip)
3436
self.assertEqual(parts.project, self.pip)
3537
self.assertEqual(parts.version, self.version)
@@ -108,6 +110,20 @@ def test_unresolve_subproject_alias(self):
108110
self.assertEqual(parts.version, self.subproject_version)
109111
self.assertEqual(parts.filename, "/index.html")
110112

113+
def test_unresolve_subproject_invalid_version(self):
114+
parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/nothing/foo.html")
115+
self.assertEqual(parts.parent_project, self.pip)
116+
self.assertEqual(parts.project, self.subproject)
117+
self.assertEqual(parts.version, None)
118+
self.assertEqual(parts.filename, None)
119+
120+
def test_unresolve_subproject_invalid_translation(self):
121+
parts = unresolve("https://pip.readthedocs.io/projects/sub/es/latest/foo.html")
122+
self.assertEqual(parts.parent_project, self.pip)
123+
self.assertEqual(parts.project, self.subproject)
124+
self.assertEqual(parts.version, None)
125+
self.assertEqual(parts.filename, None)
126+
111127
def test_unresolver_translation(self):
112128
parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html")
113129
self.assertEqual(parts.parent_project, self.pip)
@@ -196,6 +212,24 @@ def test_unresolve_external_version_no_existing_version(self):
196212
self.assertEqual(parts.version, None)
197213
self.assertEqual(parts.filename, None)
198214

215+
def test_external_version_not_matching_final_version(self):
216+
parts = unresolve("https://pip--10.dev.readthedocs.build/en/latest/")
217+
self.assertEqual(parts.parent_project, self.pip)
218+
self.assertEqual(parts.project, self.pip)
219+
self.assertEqual(parts.version, None)
220+
self.assertEqual(parts.filename, None)
221+
222+
def test_normal_version_in_external_version_subdomain(self):
223+
parts = unresolve("https://pip--latest.dev.readthedocs.build/en/latest/")
224+
self.assertEqual(parts.parent_project, self.pip)
225+
self.assertEqual(parts.project, self.pip)
226+
self.assertEqual(parts.version, None)
227+
self.assertEqual(parts.filename, None)
228+
229+
def test_malformed_external_version(self):
230+
parts = unresolve("https://pip-latest.dev.readthedocs.build/en/latest/")
231+
self.assertEqual(parts, None)
232+
199233
def test_unresolver_unknown_host(self):
200234
parts = unresolve("https://random.stuff.com/en/latest/")
201235
self.assertEqual(parts, None)

0 commit comments

Comments
 (0)