Skip to content

Proxito: separate project slug extraction from request manipulation #9462

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 18, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 109 additions & 63 deletions readthedocs/proxito/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,64 @@
log = structlog.get_logger(__name__) # noqa


def _get_domain_from_host(host):
"""
Get the normalized domain from a hostname.

A hostname can include the port.
"""
return host.lower().split(":")[0]


def _unresolve_domain(domain):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good if this function logs "where did we get the project slug from?". Was it from a subdomain of public domain? a custom domain? a subdomain of the external domain?

It could be in info mode for now while we are working on this refactor, so we can check that things are working correctly in production. Then, we can lower it down to debug.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are also logged outside the function

log.debug('Proxito CNAME.', host=host)
, similar to #9462 (comment) I wasn't sure if we should logs this outside or inside the function, also, they are set as debug as the old logs.

If you think this makes more sense inside the function, let me know.

Copy link
Member

@ericholscher ericholscher Aug 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should move the logic into this function, since this function will also be called in other places other than proxito after it's moved to the unresolver, right?

"""
Unresolve domain by extracting relevant information from it.

:param str domain: Domain to extract the information from.
:returns: A tuple with: the project slug, domain object, and if the domain
Copy link
Member

@ericholscher ericholscher Aug 17, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels like an odd set of things to return.. Probably better to have an object here or something? 3 random values is hard to read in the code. But not a huge deal since it's an internal function I guess.

is from an external version.
"""
public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN)
external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN)

subdomain, *root_domain = domain.split(".", maxsplit=1)
root_domain = root_domain[0] if root_domain else ""

if public_domain in domain:
# Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`.
if public_domain == root_domain:
project_slug = subdomain
log.info("Public domain.", domain=domain)
return project_slug, None, False

# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example,
# but these might be phishing, so let's ignore them for now.
log.warning("Weird variation of our domain.", domain=domain)
return None, None, False

# Serve PR builds on external_domain host.
if external_domain == root_domain:
try:
log.info("External versions domain.", domain=domain)
project_slug, _ = subdomain.rsplit("--", maxsplit=1)
return project_slug, None, True
except ValueError:
log.info("Invalid format of external versions domain.", domain=domain)
return None, None, False

# Custom domain.
domain_object = (
Domain.objects.filter(domain=domain).prefetch_related("project").first()
)
if domain_object:
log.info("Custom domain.", domain=domain)
project_slug = domain_object.project.slug
return project_slug, domain_object, False

log.info("Invalid domain.", domain=domain)
return None, None, None


def map_host_to_project_slug(request): # pylint: disable=too-many-return-statements
"""
Take the request and map the host to the proper project slug.
Expand All @@ -38,76 +96,41 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem
- This sets ``request.canonicalize`` with the value as the reason
"""

host = request.get_host().lower().split(':')[0]
public_domain = settings.PUBLIC_DOMAIN.lower().split(':')[0]
external_domain = settings.RTD_EXTERNAL_VERSION_DOMAIN.lower().split(':')[0]

host_parts = host.split('.')
public_domain_parts = public_domain.split('.')
external_domain_parts = external_domain.split('.')
host = _get_domain_from_host(request.get_host())
public_domain = _get_domain_from_host(settings.PUBLIC_DOMAIN)
external_domain = _get_domain_from_host(settings.RTD_EXTERNAL_VERSION_DOMAIN)

project_slug = None

# Explicit Project slug being passed in
# Explicit Project slug being passed in.
if 'HTTP_X_RTD_SLUG' in request.META:
project_slug = request.META['HTTP_X_RTD_SLUG'].lower()
if Project.objects.filter(slug=project_slug).exists():
request.rtdheader = True
log.info('Setting project based on X_RTD_SLUG header.', project_slug=project_slug)
return project_slug

if public_domain in host or host == 'proxito':
# Serve from the PUBLIC_DOMAIN, ensuring it looks like `foo.PUBLIC_DOMAIN`
if public_domain_parts == host_parts[1:]:
project_slug = host_parts[0]
request.subdomain = True
log.debug('Proxito Public Domain.', host=host)
if Domain.objects.filter(project__slug=project_slug).filter(
canonical=True,
https=True,
).exists():
log.debug('Proxito Public Domain -> Canonical Domain Redirect.', host=host)
request.canonicalize = constants.REDIRECT_CANONICAL_CNAME
elif (
ProjectRelationship.objects.
filter(child__slug=project_slug).exists()
):
log.debug('Proxito Public Domain -> Subproject Main Domain Redirect.', host=host)
request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN
return project_slug

# TODO: This can catch some possibly valid domains (docs.readthedocs.io.com) for example
# But these feel like they might be phishing, etc. so let's block them for now.
log.warning('Weird variation on our hostname.', host=host)
return render(
request, 'core/dns-404.html', context={'host': host}, status=400
)

if external_domain in host:
# Serve custom versions on external-host-domain
if external_domain_parts == host_parts[1:]:
try:
project_slug, version_slug = host_parts[0].rsplit('--', 1)
request.external_domain = True
request.host_version_slug = version_slug
log.debug('Proxito External Version Domain.', host=host)
return project_slug
except ValueError:
log.warning('Weird variation on our hostname.', host=host)
return render(
request, 'core/dns-404.html', context={'host': host}, status=400
)
project_slug, domain_object, external = _unresolve_domain(host)
if not project_slug:
# Block domains that look like ours, may be phishing.
if external_domain in host or public_domain in host:
log.warning("Weird variation on our hostname.", host=host)
return render(
request,
"core/dns-404.html",
context={"host": host},
status=400,
)
# Some person is CNAMEing to us without configuring a domain - 404.
log.debug("CNAME 404.", host=host)
return render(request, "core/dns-404.html", context={"host": host}, status=404)

# Serve CNAMEs
domain = Domain.objects.filter(domain=host).first()
if domain:
project_slug = domain.project.slug
# Custom domain.
if domain_object:
request.cname = True
request.domain = domain
request.domain = domain_object
Comment on lines 128 to +129
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to do what I'm going to propose in this PR, but, can we avoid passing things by injecting things in the request? In case this is not possible, or it's our best option, can we define a specific well-documented object?

request.readthedocs = ReadTheDocsRequest(
  cname=bool,
  domain=Domain,
  external_domain=bool,
  host_version_slug=str,
  canonicalize=bool,
  subdomain=bool,
)

I think we have this kind of injections all over our code and I have no idea what are all of those possible values and how they are expected to be used. Each time I work on this code I need to grep some code to understand how they are used.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, my goal with the new implementation is also reducing the things we inject into the request, and pass them explicitly where needed if possible, but if we still require them using an object would be another good option.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, it would be nice to standardize the data we're setting on the request. There's a few random places we're reading that data, so we need to be careful that we don't randomly break something downstream though.

log.debug('Proxito CNAME.', host=host)

if domain.https and not request.is_secure():
# Redirect HTTP -> HTTPS (302) for this custom domain
if domain_object.https and not request.is_secure():
# Redirect HTTP -> HTTPS (302) for this custom domain.
log.debug('Proxito CNAME HTTPS Redirect.', host=host)
request.canonicalize = constants.REDIRECT_HTTPS

Expand All @@ -116,11 +139,34 @@ def map_host_to_project_slug(request): # pylint: disable=too-many-return-statem

return project_slug

# Some person is CNAMEing to us without configuring a domain - 404.
log.debug('CNAME 404.', host=host)
return render(
request, 'core/dns-404.html', context={'host': host}, status=404
)
# Pull request previews.
if external:
subdomain, _ = host.split(".", maxsplit=1)
_, version_slug = subdomain.rsplit("--", maxsplit=1)
request.external_domain = True
request.host_version_slug = version_slug
log.debug("Proxito External Version Domain.", host=host)
return project_slug

# Normal doc serving.
request.subdomain = True
log.debug("Proxito Public Domain.", host=host)
if (
Domain.objects.filter(project__slug=project_slug)
.filter(
canonical=True,
https=True,
)
.exists()
):
log.debug("Proxito Public Domain -> Canonical Domain Redirect.", host=host)
request.canonicalize = constants.REDIRECT_CANONICAL_CNAME
elif ProjectRelationship.objects.filter(child__slug=project_slug).exists():
log.debug(
"Proxito Public Domain -> Subproject Main Domain Redirect.", host=host
)
request.canonicalize = constants.REDIRECT_SUBPROJECT_MAIN_DOMAIN
return project_slug


class ProxitoMiddleware(MiddlewareMixin):
Expand Down