Skip to content

Unresolver: strict validation for external versions and other fixes #9534

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Sep 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 93 additions & 32 deletions readthedocs/core/unresolver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from dataclasses import dataclass
from urllib.parse import urlparse
from urllib.parse import ParseResult, urlparse

import structlog
from django.conf import settings
Expand All @@ -27,8 +27,7 @@ class UnresolvedURL:

version: Version = None
filename: str = None
query: str = None
fragment: str = None
parsed_url: ParseResult = None
domain: Domain = None
external: bool = False

Expand All @@ -41,59 +40,92 @@ class Unresolver:
# - /en/latest/
# - /en/latest/file/name/
multiversion_pattern = re.compile(
r"^/(?P<language>{lang_slug})(/((?P<version>{version_slug})(/(?P<file>{filename_slug}))?)?)?$".format( # noqa
r"""
^/(?P<language>{lang_slug}) # Must have the language slug.
(/((?P<version>{version_slug})(/(?P<file>{filename_slug}))?)?)?$ # Optionally a version followed by a file. # noqa
""".format(
**pattern_opts
)
),
re.VERBOSE,
)

# This pattern matches:
# - /projects/subproject
# - /projects/subproject/
# - /projects/subproject/file/name/
subproject_pattern = re.compile(
r"^/projects/(?P<project>{project_slug}+)(/(?P<file>{filename_slug}))?$".format(
r"""
^/projects/ # Must have the `projects` prefix.
(?P<project>{project_slug}+) # Followed by the subproject alias.
(/(?P<file>{filename_slug}))?$ # Optionally a filename, which will be recursively resolved.
""".format(
**pattern_opts
)
),
re.VERBOSE,
)

def unresolve(self, url, add_index=True):
def unresolve(self, url, append_indexhtml=True):
"""
Turn a URL into the component parts that our views would use to process them.

This is useful for lots of places,
like where we want to figure out exactly what file a URL maps to.

:param url: Full URL to unresolve (including the protocol and domain part).
:param add_index: If `True` the filename will be normalized
:param append_indexhtml: If `True` directories will be normalized
to end with ``/index.html``.
"""
parsed = urlparse(url)
domain = self.get_domain_from_host(parsed.netloc)
project_slug, domain_object, external = self.unresolve_domain(domain)
if not project_slug:
(
parent_project_slug,
domain_object,
external_version_slug,
) = self.unresolve_domain(domain)
if not parent_project_slug:
return None

parent_project = Project.objects.filter(slug=project_slug).first()
parent_project = Project.objects.filter(slug=parent_project_slug).first()
if not parent_project:
return None

project, version, filename = self._unresolve_path(
current_project, version, filename = self._unresolve_path(
parent_project=parent_project,
path=parsed.path,
)

if add_index and filename and filename.endswith("/"):
# Make sure we are serving the external version from the subdomain.
if external_version_slug and version:
if external_version_slug != version.slug:
log.warning(
"Invalid version for external domain.",
domain=domain,
version_slug=version.slug,
)
version = None
filename = None
Comment on lines +105 to +106
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do something more than set these to None here? Is raising an exception better, rather than having the caller need to know what None means on these fields?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting version to None will be interpreted as like the version wasn't found, but we could return None, or raise an exception, since those URLs will be generated by manually changing the URL.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this as well, and I'm not sure what's my position here.

I'd like to raise exceptions for all these cases were we return None for not found things. However, I'm not sure it's possible for all the cases because IIRC there are other places were project, None, None is useful to continue moving forward.

If that's not the case and I'm confused here, I'm 👍🏼 on standardizing this and always raise a detailed exception for each case that we can clearly handle from outside.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, basically this would be interpreted as a 404, like if the version wouldn't exist.

elif not version.is_external:
log.warning(
"Attempt of serving a non-external version from RTD_EXTERNAL_VERSION_DOMAIN.",
domain=domain,
version_slug=version.slug,
version_type=version.type,
url=url,
)
version = None
filename = None

if append_indexhtml and filename and filename.endswith("/"):
filename += "index.html"

return UnresolvedURL(
parent_project=parent_project,
project=project or parent_project,
project=current_project or parent_project,
version=version,
filename=filename,
query=parsed.query,
fragment=parsed.fragment,
parsed_url=parsed,
domain=domain_object,
external=external,
external=bool(external_version_slug),
)

@staticmethod
Expand All @@ -109,7 +141,11 @@ def _match_multiversion_project(self, parent_project, path):
Try to match a multiversion project.

If the translation exists, we return a result even if the version doesn't,
so the translation is taken as the canonical project (useful for 404 pages).
so the translation is taken as the current project (useful for 404 pages).

:returns: None or a tuple with the current project, version and file.
A tuple with only the project means we weren't able to find a version,
but the translation was correct.
"""
match = self.multiversion_pattern.match(path)
if not match:
Expand Down Expand Up @@ -138,24 +174,40 @@ def _match_subproject(self, parent_project, path):

If the subproject exists, we try to resolve the rest of the path
with the subproject as the canonical project.

If the subproject exists, we return a result even if version doesn't,
so the subproject is taken as the current project (useful for 404 pages).

:returns: None or a tuple with the current project, version and file.
A tuple with only the project means we were able to find the subproject,
but we weren't able to resolve the rest of the path.
"""
match = self.subproject_pattern.match(path)
if not match:
return None

project_slug = match.group("project")
subproject_alias = match.group("project")
file = self._normalize_filename(match.group("file"))
project_relationship = (
parent_project.subprojects.filter(alias=project_slug)
parent_project.subprojects.filter(alias=subproject_alias)
.prefetch_related("child")
.first()
)
if project_relationship:
return self._unresolve_path(
parent_project=project_relationship.child,
# We use the subproject as the new parent project
# to resolve the rest of the path relative to it.
subproject = project_relationship.child
response = self._unresolve_path(
parent_project=subproject,
path=file,
check_subprojects=False,
)
# If we got a valid response, return that,
# otherwise return the current subproject
# as the current project without a valid version or path.
if response:
return response
return subproject, None, None
return None

def _match_single_version_project(self, parent_project, path):
Expand All @@ -182,10 +234,19 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True):
If the returned version is `None`, then we weren't able to
unresolve the path into a valid version of the project.

The checks are done in the following order:

- Check for multiple versions if the parent project
isn't a single version project.
- Check for subprojects.
- Check for single versions if the parent project isn’t
a multi version project.

:param parent_project: The project that owns the path.
:param path: The path to unresolve.
:param check_subprojects: If we should check for subprojects,
this is used to call this function recursively.
this is used to call this function recursively when
resolving the path from a subproject (we don't support subprojects of subprojects).

:returns: A tuple with: project, version, and file name.
"""
Expand Down Expand Up @@ -216,7 +277,7 @@ def _unresolve_path(self, parent_project, path, check_subprojects=True):
if response:
return response

return None, None, None
return parent_project, None, None

@staticmethod
def get_domain_from_host(host):
Expand All @@ -234,8 +295,8 @@ def unresolve_domain(self, domain):
Unresolve domain by extracting relevant information from it.

:param str domain: Domain to extract the information from.
:returns: A tuple with: the project slug, domain object, and if the domain
is from an external version.
:returns: A tuple with: the project slug, domain object, and the
external version slug if the domain is from an external version.
"""
public_domain = self.get_domain_from_host(settings.PUBLIC_DOMAIN)
external_domain = self.get_domain_from_host(
Expand All @@ -250,22 +311,22 @@ def unresolve_domain(self, domain):
if public_domain == root_domain:
project_slug = subdomain
log.debug("Public domain.", domain=domain)
return project_slug, None, False
return project_slug, None, None

# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com)
# for example, but these might be phishing, so let's ignore them for now.
log.warning("Weird variation of our domain.", domain=domain)
return None, None, False
return None, None, None

# Serve PR builds on external_domain host.
if external_domain == root_domain:
try:
project_slug, version_slug = subdomain.rsplit("--", maxsplit=1)
log.debug("External versions domain.", domain=domain)
project_slug, _ = subdomain.rsplit("--", maxsplit=1)
return project_slug, None, True
return project_slug, None, version_slug
except ValueError:
log.info("Invalid format of external versions domain.", domain=domain)
return None, None, False
return None, None, None

# Custom domain.
domain_object = (
Expand Down
2 changes: 1 addition & 1 deletion readthedocs/embed/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get(self, request):
if url:
unresolved = self.unresolved_url
path = unresolved.filename
section = unresolved.fragment
section = unresolved.parsed_url.fragment
elif not path and not doc:
return Response(
{
Expand Down
10 changes: 5 additions & 5 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug)
return project_slug

project_slug, domain_object, external = unresolver.unresolve_domain(host)
project_slug, domain_object, external_version_slug = unresolver.unresolve_domain(
host
)
if not project_slug:
# Block domains that look like ours, may be phishing.
if external_domain in host or public_domain in host:
Expand Down Expand Up @@ -85,11 +87,9 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
return project_slug

# Pull request previews.
if external:
subdomain, _ = host.split(".", maxsplit=1)
_, version_slug = subdomain.rsplit("--", maxsplit=1)
if external_version_slug:
request.external_domain = True
request.host_version_slug = version_slug
request.host_version_slug = external_version_slug
log.debug("Proxito External Version Domain.", host=host)
return project_slug

Expand Down
40 changes: 37 additions & 3 deletions readthedocs/rtd_tests/tests/test_unresolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ def test_unresolver(self):
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, self.version)
self.assertEqual(parts.filename, "/foo.html")
self.assertEqual(parts.fragment, "fragment")
self.assertEqual(parts.query, "search=api")
self.assertEqual(parts.parsed_url.fragment, "fragment")
self.assertEqual(parts.parsed_url.query, "search=api")

def test_filename_wihtout_index(self):
parts = unresolve("https://pip.readthedocs.io/en/latest/file/", add_index=False)
parts = unresolve(
"https://pip.readthedocs.io/en/latest/file/", append_indexhtml=False
)
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, self.version)
Expand Down Expand Up @@ -108,6 +110,20 @@ def test_unresolve_subproject_alias(self):
self.assertEqual(parts.version, self.subproject_version)
self.assertEqual(parts.filename, "/index.html")

def test_unresolve_subproject_invalid_version(self):
parts = unresolve("https://pip.readthedocs.io/projects/sub/ja/nothing/foo.html")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.subproject)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_unresolve_subproject_invalid_translation(self):
parts = unresolve("https://pip.readthedocs.io/projects/sub/es/latest/foo.html")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.subproject)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_unresolver_translation(self):
parts = unresolve("https://pip.readthedocs.io/ja/latest/foo.html")
self.assertEqual(parts.parent_project, self.pip)
Expand Down Expand Up @@ -196,6 +212,24 @@ def test_unresolve_external_version_no_existing_version(self):
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_external_version_not_matching_final_version(self):
parts = unresolve("https://pip--10.dev.readthedocs.build/en/latest/")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_normal_version_in_external_version_subdomain(self):
parts = unresolve("https://pip--latest.dev.readthedocs.build/en/latest/")
self.assertEqual(parts.parent_project, self.pip)
self.assertEqual(parts.project, self.pip)
self.assertEqual(parts.version, None)
self.assertEqual(parts.filename, None)

def test_malformed_external_version(self):
parts = unresolve("https://pip-latest.dev.readthedocs.build/en/latest/")
self.assertEqual(parts, None)

def test_unresolver_unknown_host(self):
parts = unresolve("https://random.stuff.com/en/latest/")
self.assertEqual(parts, None)