diff --git a/.circleci/config.yml b/.circleci/config.yml index 25a50f29076..ea681b49c8d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -17,6 +17,16 @@ jobs: - run: pip install --user tox - run: tox -e py36,codecov + tests-embedapi: + docker: + - image: 'cimg/python:3.6' + steps: + - checkout + - run: git submodule sync + - run: git submodule update --init + - run: pip install --user tox + - run: tox -c tox.embedapi.ini + checks: docker: - image: 'cimg/python:3.6' @@ -45,3 +55,4 @@ workflows: jobs: - checks - tests + - tests-embedapi diff --git a/pytest.ini b/pytest.ini index 15e67f8a2d3..d06623bde72 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,11 @@ [pytest] -addopts = --reuse-db --strict-markers +addopts = --strict-markers markers = search serve proxito + embed_api + sphinx python_files = tests.py test_*.py *_tests.py filterwarnings = # Ignore external dependencies warning deprecations @@ -13,3 +15,9 @@ filterwarnings = ignore:Pagination may yield inconsistent results with an unordered object_list.*:django.core.paginator.UnorderedObjectListWarning # docutils ignore:'U' mode is deprecated:DeprecationWarning + # slumber + ignore:Using 'method_whitelist' with Retry is deprecated and will be removed in v2.0.*:DeprecationWarning + # kombu + ignore:SelectableGroups dict interface is deprecated.*:DeprecationWarning + # django + ignore:Remove the context parameter from JSONField.*:django.utils.deprecation.RemovedInDjango30Warning \ No newline at end of file diff --git a/readthedocs/conftest.py b/readthedocs/conftest.py index 0dc0b840141..de296516abf 100644 --- a/readthedocs/conftest.py +++ b/readthedocs/conftest.py @@ -1,6 +1,12 @@ import pytest from rest_framework.test import APIClient + +pytest_plugins = ( + 'sphinx.testing.fixtures', +) + + @pytest.fixture def api_client(): return APIClient() diff --git a/readthedocs/embed/tests/test_links.py b/readthedocs/embed/tests/test_links.py index 8650a361282..aab2f81b230 100644 --- 
a/readthedocs/embed/tests/test_links.py +++ b/readthedocs/embed/tests/test_links.py @@ -3,7 +3,7 @@ import pytest from pyquery import PyQuery -from readthedocs.embed.views import clean_links +from readthedocs.embed.utils import clean_links URLData = namedtuple('URLData', ['docurl', 'href', 'expected']) diff --git a/readthedocs/embed/utils.py b/readthedocs/embed/utils.py index 95f8640749f..94ad78bfe59 100644 --- a/readthedocs/embed/utils.py +++ b/readthedocs/embed/utils.py @@ -1,5 +1,8 @@ """Embed utils.""" +from urllib.parse import urlparse +from pyquery import PyQuery as PQ # noqa + def recurse_while_none(element): """Recursively find the leaf node with the ``href`` attribute.""" @@ -10,3 +13,55 @@ def recurse_while_none(element): if not href: href = element.attrib.get('id') return {element.text: href} + + +def clean_links(obj, url, html_raw_response=False): + """ + Rewrite (internal) links to make them absolute. + + 1. external links are not changed + 2. prepend URL to links that are just fragments (e.g. #section) + 3. prepend URL (without filename) to internal relative links + """ + + # TODO: do not depend on PyQuery + obj = PQ(obj) + + if url is None: + return obj + + for link in obj.find('a'): + base_url = urlparse(url) + # We need to make all internal links, to be absolute + href = link.attrib['href'] + parsed_href = urlparse(href) + if parsed_href.scheme or parsed_href.path.startswith('/'): + # don't change external links + continue + + if not parsed_href.path and parsed_href.fragment: + # href="#section-link" + new_href = base_url.geturl() + href + link.attrib['href'] = new_href + continue + + if not base_url.path.endswith('/'): + # internal relative link + # href="../../another.html" and ``base_url`` is not HTMLDir + # (e.g. 
/en/latest/deep/internal/section/page.html) + # we want to remove the trailing filename (page.html) and use the rest as base URL + # The resulting absolute link should be + # https://slug.readthedocs.io/en/latest/deep/internal/section/../../another.html + + # remove the filename (page.html) from the original document URL (base_url) and, + path, _ = base_url.path.rsplit('/', 1) + # append the value of href (../../another.html) to the base URL. + base_url = base_url._replace(path=path + '/') + + new_href = base_url.geturl() + href + link.attrib['href'] = new_href + + if html_raw_response: + return obj.outerHtml() + + return obj diff --git a/readthedocs/embed/v3/__init__.py b/readthedocs/embed/v3/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/readthedocs/embed/v3/tests/__init__.py b/readthedocs/embed/v3/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/readthedocs/embed/v3/tests/conftest.py b/readthedocs/embed/v3/tests/conftest.py new file mode 100644 index 00000000000..0efc9948632 --- /dev/null +++ b/readthedocs/embed/v3/tests/conftest.py @@ -0,0 +1,14 @@ +import os +import shutil +import pytest + +from .utils import srcdir + + +@pytest.fixture(autouse=True, scope='module') +def remove_sphinx_build_output(): + """Remove _build/ folder, if exist.""" + for path in (srcdir,): + build_path = os.path.join(path, '_build') + if os.path.exists(build_path): + shutil.rmtree(build_path) diff --git a/readthedocs/embed/v3/tests/examples/default/bibtex-cite.rst b/readthedocs/embed/v3/tests/examples/default/bibtex-cite.rst new file mode 100644 index 00000000000..bac1deac36c --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/bibtex-cite.rst @@ -0,0 +1,9 @@ +sphinxcontrib-bibtex +==================== + +See https://sphinxcontrib-bibtex.readthedocs.io/en/latest/ for more information about how to use ``sphinxcontrib-bibtex``. + +See :cite:t:`1987:nelson` for an introduction to non-standard analysis. 
+Non-standard analysis is fun :cite:p:`1987:nelson`. + +.. bibliography:: diff --git a/readthedocs/embed/v3/tests/examples/default/chapter-i.rst b/readthedocs/embed/v3/tests/examples/default/chapter-i.rst new file mode 100644 index 00000000000..6bf55dad0f6 --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/chapter-i.rst @@ -0,0 +1,11 @@ +:orphan: + +Chapter I +========= + +This is Chapter I. + +Section I +--------- + +This the Section I inside Chapter I. diff --git a/readthedocs/embed/v3/tests/examples/default/conf.py b/readthedocs/embed/v3/tests/examples/default/conf.py new file mode 100644 index 00000000000..b8fe3483942 --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/conf.py @@ -0,0 +1,17 @@ +# conf.py to run tests +import sphinxcontrib.bibtex + +master_doc = 'index' +extensions = [ + 'sphinx.ext.autosectionlabel', + 'sphinxcontrib.bibtex', +] + +bibtex_bibfiles = ['refs.bib'] + +def setup(app): + app.add_object_type( + 'confval', # directivename + 'confval', # rolename + 'pair: %s; configuration value', # indextemplate + ) diff --git a/readthedocs/embed/v3/tests/examples/default/configuration.rst b/readthedocs/embed/v3/tests/examples/default/configuration.rst new file mode 100644 index 00000000000..7ac6465b9f0 --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/configuration.rst @@ -0,0 +1,12 @@ +Configuration +============= + +Examples of configurations. + +.. confval:: config1 + + Description: This the description for config1 + + Default: ``'Default value for config'`` + + Type: bool diff --git a/readthedocs/embed/v3/tests/examples/default/glossary.rst b/readthedocs/embed/v3/tests/examples/default/glossary.rst new file mode 100644 index 00000000000..f8f50705e4d --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/glossary.rst @@ -0,0 +1,9 @@ +Glossary +-------- + +Example using a ``:term:`` role :term:`Read the Docs`. + +.. glossary:: + + Read the Docs + Best company ever. 
diff --git a/readthedocs/embed/v3/tests/examples/default/index.rst b/readthedocs/embed/v3/tests/examples/default/index.rst new file mode 100644 index 00000000000..540bed0984c --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/index.rst @@ -0,0 +1,9 @@ +Title +===== + +This is an example page used to test EmbedAPI parsing features. + +Sub-title +--------- + +This is a reference to :ref:`sub-title`. diff --git a/readthedocs/embed/v3/tests/examples/default/refs.bib b/readthedocs/embed/v3/tests/examples/default/refs.bib new file mode 100644 index 00000000000..8be9d662d21 --- /dev/null +++ b/readthedocs/embed/v3/tests/examples/default/refs.bib @@ -0,0 +1,6 @@ +@Book{1987:nelson, + author = {Edward Nelson}, + title = {Radically Elementary Probability Theory}, + publisher = {Princeton University Press}, + year = {1987} +} diff --git a/readthedocs/embed/v3/tests/test_basics.py b/readthedocs/embed/v3/tests/test_basics.py new file mode 100644 index 00000000000..56bf7e2b965 --- /dev/null +++ b/readthedocs/embed/v3/tests/test_basics.py @@ -0,0 +1,71 @@ +import pytest + +from django.conf import settings +from django.core.cache import cache +from django.urls import reverse + +from .utils import srcdir + + +@pytest.mark.django_db +@pytest.mark.embed_api +class TestEmbedAPIv3Basics: + + @pytest.fixture(autouse=True) + def setup_method(self, settings): + settings.USE_SUBDOMAIN = True + settings.PUBLIC_DOMAIN = 'readthedocs.io' + settings.RTD_EMBED_API_EXTERNAL_DOMAINS = ['docs.project.com'] + + self.api_url = reverse('embed_api_v3') + + yield + cache.clear() + + def test_not_url_query_argument(self, client): + params = {} + response = client.get(self.api_url, params) + assert response.status_code == 400 + assert response.json() == {'error': 'Invalid arguments. 
Please provide "url".'} + + def test_not_allowed_domain(self, client): + params = { + 'url': 'https://docs.notalloweddomain.com#title', + } + response = client.get(self.api_url, params) + assert response.status_code == 400 + assert response.json() == {'error': 'External domain not allowed. domain=docs.notalloweddomain.com'} + + def test_malformed_url(self, client): + params = { + 'url': 'https:///page.html#title', + } + response = client.get(self.api_url, params) + assert response.status_code == 400 + assert response.json() == {'error': f'The URL requested is malformed. url={params["url"]}'} + + def test_rate_limit_domain(self, client): + params = { + 'url': 'https://docs.project.com#title', + } + cache_key = 'embed-api-docs.project.com' + cache.set(cache_key, settings.RTD_EMBED_API_DOMAIN_RATE_LIMIT) + + response = client.get(self.api_url, params) + assert response.status_code == 429 + assert response.json() == {'error': 'Too many requests for this domain. domain=docs.project.com'} + + def test_infinite_redirect(self, client, requests_mock): + requests_mock.get( + 'https://docs.project.com', + status_code=302, + headers={ + 'Location': 'https://docs.project.com', + }, + ) + params = { + 'url': 'https://docs.project.com#title', + } + response = client.get(self.api_url, params) + assert response.status_code == 400 + assert response.json() == {'error': f'The URL requested generates too many redirects. 
url={params["url"]}'} diff --git a/readthedocs/embed/v3/tests/test_external_pages.py b/readthedocs/embed/v3/tests/test_external_pages.py new file mode 100644 index 00000000000..e1377da5f4c --- /dev/null +++ b/readthedocs/embed/v3/tests/test_external_pages.py @@ -0,0 +1,254 @@ +import docutils +import os + +import pytest +import sphinx + +from packaging.version import Version + +from django.conf import settings +from django.core.cache import cache +from django.urls import reverse + +from .utils import srcdir + + +@pytest.mark.django_db +@pytest.mark.embed_api +class TestEmbedAPIv3ExternalPages: + + @pytest.fixture(autouse=True) + def setup_method(self, settings): + settings.USE_SUBDOMAIN = True + settings.PUBLIC_DOMAIN = 'readthedocs.io' + settings.RTD_EMBED_API_EXTERNAL_DOMAINS = ['docs.project.com'] + + self.api_url = reverse('embed_api_v3') + + yield + cache.clear() + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_default_main_section(self, app, client, requests_mock): + app.build() + path = app.outdir / 'index.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com', text=content) + + params = { + 'url': 'https://docs.project.com', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + # The output is different because docutils is outputting this, + # and we're not sanitizing it, but just passing it through. + if Version(docutils.__version__) >= Version('0.17'): + content = '
\n \n
\n

Title

\n

This is an example page used to test EmbedAPI parsing features.

\n
\n

Sub-title

\n

This is a reference to Sub-title.

\n
\n
\n\n\n
' + else: + content = '
\n \n
\n

Title

\n

This is an example page used to test EmbedAPI parsing features.

\n
\n

Sub-title

\n

This is a reference to Sub-title.

\n
\n
\n\n\n
' + + assert response.json() == { + 'url': 'https://docs.project.com', + 'fragment': None, + 'content': content, + 'external': True, + } + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_specific_identifier(self, app, client, requests_mock): + app.build() + path = app.outdir / 'index.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com', text=content) + + params = { + 'url': 'https://docs.project.com#sub-title', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + if Version(docutils.__version__) >= Version('0.17'): + content = '
\n

Sub-title

\n

This is a reference to Sub-title.

\n
' + else: + content = '
\n

Sub-title

\n

This is a reference to Sub-title.

\n
' + + assert response.json() == { + 'url': 'https://docs.project.com#sub-title', + 'fragment': 'sub-title', + 'content': content, + 'external': True, + } + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_dl_identifier(self, app, client, requests_mock): + app.build() + path = app.outdir / 'configuration.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com/configuration.html', text=content) + + params = { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + + if sphinx.version_info < (3, 5, 0): + content = '
\nconfig1
' + elif sphinx.version_info[:2] == (3, 5): + content = '
\nconfig1
' + else: + content = '
\nconfig1
' + + assert response.json() == { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + 'fragment': 'confval-config1', + 'content': content, + 'external': True, + } + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_dl_identifier_doctool_sphinx(self, app, client, requests_mock): + app.build() + path = app.outdir / 'configuration.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com/configuration.html', text=content) + + # Calling the API without doctool + params = { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + if sphinx.version_info < (3, 5, 0): + content = '
\nconfig1
' + elif sphinx.version_info[:2] == (3, 5): + content = '
\nconfig1
' + else: + content = '
\nconfig1
' + + assert response.json() == { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + 'fragment': 'confval-config1', + 'content': content, + 'external': True, + } + + # Calling the API with doctool + params = { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + 'doctool': 'sphinx', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + if sphinx.version_info < (3, 0, 0): # <3.0 + content = '
\n
\nconfig1
\n

Description: This the description for config1

\n

Default: \'Default value for config\'

\n

Type: bool

\n
' + elif sphinx.version_info[:2] == (3, 5): + content = '
\n
\nconfig1
\n

Description: This the description for config1

\n

Default: \'Default value for config\'

\n

Type: bool

\n
' + elif sphinx.version_info < (4, 0, 0): # >3.0,=!3.5.x,<4.0 + content = '
\n
\nconfig1
\n

Description: This the description for config1

\n

Default: \'Default value for config\'

\n

Type: bool

\n
' + else: # >=4.0 + content = '
\n
\nconfig1
\n

Description: This the description for config1

\n

Default: \'Default value for config\'

\n

Type: bool

\n
' + + assert response.json() == { + 'url': 'https://docs.project.com/configuration.html#confval-config1', + 'fragment': 'confval-config1', + 'content': content, + 'external': True, + } + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_citation_identifier_doctool_sphinx(self, app, client, requests_mock): + app.build() + path = app.outdir / 'bibtex-cite.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com/bibtex-cite.html', text=content) + + # Calling the API without doctool + params = { + 'url': 'https://docs.project.com/bibtex-cite.html#id4', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + assert response.json() == { + 'url': 'https://docs.project.com/bibtex-cite.html#id4', + 'fragment': 'id4', + 'content': '
Nel87(1,2)
', + 'external': True, + } + + # Calling the API with doctool + params = { + 'url': 'https://docs.project.com/bibtex-cite.html#id4', + 'doctool': 'sphinx', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + assert response.json() == { + 'url': 'https://docs.project.com/bibtex-cite.html#id4', + 'fragment': 'id4', + 'content': '
\n
Nel87(1,2)
\n

Edward Nelson. Radically Elementary Probability Theory. Princeton University Press, 1987.

\n
\n
', + 'external': True, + } + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + def test_glossary_identifier_doctool_sphinx(self, app, client, requests_mock): + app.build() + path = app.outdir / 'glossary.html' + assert path.exists() is True + content = open(path).read() + requests_mock.get('https://docs.project.com/glossary.html', text=content) + + # Note there are differences on the case of the fragment + if sphinx.version_info >= (3, 0, 0): + fragment = 'term-Read-the-Docs' + else: + fragment = 'term-read-the-docs' + + # Calling the API without doctool + url = f'https://docs.project.com/glossary.html#{fragment}' + params = { + 'url': url, + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + if sphinx.version_info >= (3, 5, 0): + content = f'
Read the Docs
' + else: + content = f'
Read the Docs
' + + assert response.json() == { + 'url': url, + 'fragment': fragment, + 'content': content, + 'external': True, + } + + # Calling the API with doctool + params = { + 'url': url, + 'doctool': 'sphinx', + } + response = client.get(self.api_url, params) + assert response.status_code == 200 + + if sphinx.version_info >= (3, 5, 0): + content = f'
\n
Read the Docs

Best company ever.

\n
\n
' + else: + content = f'
\n
Read the Docs

Best company ever.

\n
\n
' + + assert response.json() == { + 'url': url, + 'content': content, + 'fragment': fragment, + 'external': True, + } diff --git a/readthedocs/embed/v3/tests/test_internal_pages.py b/readthedocs/embed/v3/tests/test_internal_pages.py new file mode 100644 index 00000000000..dbf47435568 --- /dev/null +++ b/readthedocs/embed/v3/tests/test_internal_pages.py @@ -0,0 +1,77 @@ +import docutils +import pytest + +from contextlib import contextmanager +from packaging.version import Version +from unittest import mock + +import django_dynamic_fixture as fixture + +from django.conf import settings +from django.core.cache import cache +from django.urls import reverse + +from readthedocs.projects.models import Project + +from .utils import srcdir + + +@pytest.mark.django_db +@pytest.mark.embed_api +class TestEmbedAPIv3InternalPages: + + @pytest.fixture(autouse=True) + def setup_method(self, settings): + settings.USE_SUBDOMAIN = True + settings.PUBLIC_DOMAIN = 'readthedocs.io' + settings.RTD_EMBED_API_EXTERNAL_DOMAINS = [] + + self.api_url = reverse('embed_api_v3') + + self.project = fixture.get( + Project, + slug='project' + ) + + yield + cache.clear() + + def _mock_open(self, content): + @contextmanager + def f(*args, **kwargs): + read_mock = mock.MagicMock() + read_mock.read.return_value = content + yield read_mock + return f + + def _patch_storage_open(self, storage_mock, content): + storage_mock.exists.return_value = True + storage_mock.open.side_effect = self._mock_open(content) + + @pytest.mark.sphinx('html', srcdir=srcdir, freshenv=True) + @mock.patch('readthedocs.embed.v3.views.build_media_storage') + def test_default_main_section(self, build_media_storage, app, client): + app.build() + path = app.outdir / 'index.html' + assert path.exists() is True + content = open(path).read() + self._patch_storage_open(build_media_storage, content) + + params = { + 'url': 'https://project.readthedocs.io/en/latest/', + } + response = client.get(self.api_url, params) + assert 
response.status_code == 200 + + # Note the difference between `<section>` and `<div class="section">` + if Version(docutils.__version__) >= Version('0.17'): + content = '
\n \n
\n

Title

\n

This is an example page used to test EmbedAPI parsing features.

\n
\n

Sub-title

\n

This is a reference to Sub-title.

\n
\n
\n\n\n
' + else: + content = '
\n \n
\n

Title

\n

This is an example page used to test EmbedAPI parsing features.

\n
\n

Sub-title

\n

This is a reference to Sub-title.

\n
\n
\n\n\n
' + + assert response.json() == { + 'url': 'https://project.readthedocs.io/en/latest/', + 'fragment': None, + 'content': content, + 'external': False, + } diff --git a/readthedocs/embed/v3/tests/utils.py b/readthedocs/embed/v3/tests/utils.py new file mode 100644 index 00000000000..9dffe8f058f --- /dev/null +++ b/readthedocs/embed/v3/tests/utils.py @@ -0,0 +1,8 @@ +import os + + +srcdir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'examples', + 'default', +) diff --git a/readthedocs/embed/v3/urls.py b/readthedocs/embed/v3/urls.py new file mode 100644 index 00000000000..5d7c51852d5 --- /dev/null +++ b/readthedocs/embed/v3/urls.py @@ -0,0 +1,8 @@ +from django.conf.urls import url + +from .views import EmbedAPI + + +urlpatterns = [ + url(r'', EmbedAPI.as_view(), name='embed_api_v3'), +] diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py new file mode 100644 index 00000000000..c4256fa0237 --- /dev/null +++ b/readthedocs/embed/v3/views.py @@ -0,0 +1,345 @@ +"""Views for the EmbedAPI v3 app.""" + +import logging +import re +from urllib.parse import urlparse +import requests + +from selectolax.parser import HTMLParser +from pyquery import PyQuery as PQ # noqa + +from django.conf import settings +from django.core.cache import cache +from django.shortcuts import get_object_or_404 +from django.utils.functional import cached_property +from rest_framework import status +from rest_framework.permissions import AllowAny +from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer +from rest_framework.response import Response +from rest_framework.views import APIView + +from readthedocs.api.v2.mixins import CachedResponseMixin +from readthedocs.core.unresolver import unresolve +from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.embed.utils import clean_links +from readthedocs.projects.constants import PUBLIC +from readthedocs.storage import build_media_storage + +log = logging.getLogger(__name__) 
+ + +class EmbedAPIBase(CachedResponseMixin, APIView): + + # pylint: disable=line-too-long + # pylint: disable=no-self-use + + """ + Embed a section of content from any Read the Docs page. + + ### Arguments + + * url (with fragment) (required) + * doctool + * doctoolversion + + ### Example + + GET https://readthedocs.org/api/v3/embed/?url=https://docs.readthedocs.io/en/latest/features.html%23#full-text-search + + """ # noqa + + permission_classes = [AllowAny] + renderer_classes = [JSONRenderer, BrowsableAPIRenderer] + + @cached_property + def unresolved_url(self): + url = self.request.GET.get('url') + if not url: + return None + return unresolve(url) + + def _download_page_content(self, url): + # Sanitize the URL before requesting it + url = urlparse(url)._replace(fragment='', query='').geturl() + + # TODO: sanitize the cache key just in case, maybe by hashing it + cache_key = f'embed-api-{url}' + cached_response = cache.get(cache_key) + if cached_response: + log.debug('Cached response. url=%s', url) + return cached_response + + response = requests.get(url, timeout=settings.RTD_EMBED_API_DEFAULT_REQUEST_TIMEOUT) + if response.ok: + cache.set( + cache_key, + response.text, + timeout=settings.RTD_EMBED_API_PAGE_CACHE_TIMEOUT, + ) + return response.text + + def _get_page_content_from_storage(self, project, version_slug, filename): + version = get_object_or_404( + project.versions, + slug=version_slug, + # Only allow PUBLIC versions when getting the content from our + # storage for privacy/security reasons + privacy_level=PUBLIC, + ) + storage_path = project.get_storage_path( + 'html', + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + file_path = build_media_storage.join( + storage_path, + filename, + ) + try: + with build_media_storage.open(file_path) as fd: # pylint: disable=invalid-name + return fd.read() + except Exception: # noqa + log.warning('Unable to read file. 
file_path=%s', file_path) + + return None + + def _get_content_by_fragment(self, url, fragment, external, doctool, doctoolversion): + if external: + page_content = self._download_page_content(url) + else: + project = self.unresolved_url.project + version_slug = self.unresolved_url.version_slug + filename = self.unresolved_url.filename + page_content = self._get_page_content_from_storage(project, version_slug, filename) + + return self._parse_based_on_doctool(page_content, fragment, doctool, doctoolversion) + + def _find_main_node(self, html): + main_node = html.css_first('[role=main]') + if main_node: + log.info('Main node found. selector=[role=main]') + return main_node + + main_node = html.css_first('main') + if main_node: + log.info('Main node found. selector=main') + return main_node + + first_header = html.body.css_first('h1') + if first_header: + log.info('Main node found. selector=h1') + return first_header.parent + + def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversion): + # pylint: disable=unused-argument + if not page_content: + return + + node = None + if fragment: + selector = f'#{fragment}' + node = HTMLParser(page_content).css_first(selector) + else: + html = HTMLParser(page_content) + node = self._find_main_node(html) + + if not node: + return + + if doctool == 'sphinx': + # Handle ``dt`` special cases + if node.tag == 'dt': + if 'glossary' in node.parent.attributes.get('class'): + # Sphinx HTML structure for term glossary puts the ``id`` in the + # ``dt`` element with the title of the term. In this case, we + # return the parent node which contains the definition list + # and remove all ``dt/dd`` that are not the requested one + + # Structure: + #
+ #
definition
+ #
Text definition for the term
+ # ... + #
+ + # TODO: figure out whether the siblings need to be removed here + # parent = node.parent + # for n in parent.traverse(): + # if n not in (node, node.next): + # n.remove() + node = node.parent + + elif 'citation' in node.parent.attributes.get('class'): + # Sphinx HTML structure for sphinxcontrib-bibtex puts the ``id`` in the + # ``dt`` element with the title of the cite. In this case, we + # return the parent node which contains the definition list + # and remove all ``dt/dd`` that are not the requested one + + # Structure: + #
+ #
Title of the cite
+ #
Content of the cite
+ # ... + #
+ + # TODO: figure out whether the siblings need to be removed here + # parent = node.parent + # for n in parent.traverse(): + # if n not in (node, node.next): + # n.remove() + node = node.parent + + else: + # Sphinx HTML structure for definition list puts the ``id`` in + # the ``dt`` element, instead of the ``dl``. This makes + # the backend return just the title of the definition. If we + # detect this case, we return the parent with the whole ``dl`` tag + + # Structure: + #
+ #
+ # config + #
+ #

Text with a description

+ #
+ node = node.parent + + return node.html + + def get(self, request): # noqa + url = request.GET.get('url') + doctool = request.GET.get('doctool') + doctoolversion = request.GET.get('doctoolversion') + + if not url: + return Response( + { + 'error': ( + 'Invalid arguments. ' + 'Please provide "url".' + ) + }, + status=status.HTTP_400_BAD_REQUEST + ) + + parsed_url = urlparse(url) + domain = parsed_url.netloc + if not domain or not parsed_url.scheme: + return Response( + { + 'error': ( + 'The URL requested is malformed. ' + f'url={url}' + ) + }, + status=status.HTTP_400_BAD_REQUEST, + ) + + # NOTE: ``readthedocs.core.unresolver.unresolve`` returns ``None`` when + # it can't find the project in our database + external = self.unresolved_url is None + if external: + for allowed_domain in settings.RTD_EMBED_API_EXTERNAL_DOMAINS: + if re.match(allowed_domain, domain): + break + else: + log.info('Domain not allowed. domain=%s url=%s', domain, url) + return Response( + { + 'error': ( + 'External domain not allowed. ' + f'domain={domain}' + ) + }, + status=status.HTTP_400_BAD_REQUEST, + ) + + # Check rate-limit for this particular domain + cache_key = f'embed-api-{domain}' + cache.get_or_set(cache_key, 0, timeout=settings.RTD_EMBED_API_DOMAIN_RATE_LIMIT_TIMEOUT) + cache.incr(cache_key) + if cache.get(cache_key) > settings.RTD_EMBED_API_DOMAIN_RATE_LIMIT: + log.warning('Too many requests for this domain. domain=%s', domain) + return Response( + { + 'error': ( + 'Too many requests for this domain. ' + f'domain={domain}' + ) + }, + status=status.HTTP_429_TOO_MANY_REQUESTS, + ) + + # NOTE: we could validate the fragment if we want. It must contain at + # least one character, cannot start with a number, and must not contain + # whitespaces (spaces, tabs, etc.). 
+ fragment = parsed_url.fragment + + try: + content_requested = self._get_content_by_fragment( + url, + fragment, + external, + doctool, + doctoolversion, + ) + except requests.exceptions.TooManyRedirects: + log.exception('Too many redirects. url=%s', url) + return Response( + { + 'error': ( + 'The URL requested generates too many redirects. ' + f'url={url}' + ) + }, + # TODO: review these status codes to find out which on is better here + # 400 Bad Request + # 502 Bad Gateway + # 503 Service Unavailable + status=status.HTTP_400_BAD_REQUEST, + ) + except Exception: # noqa + log.exception('There was an error reading the URL requested. url=%s', url) + return Response( + { + 'error': ( + 'There was an error reading the URL requested. ' + f'url={url}' + ) + }, + status=status.HTTP_400_BAD_REQUEST, + ) + + if not content_requested: + log.warning('Identifier not found. url=%s fragment=%s', url, fragment) + return Response( + { + 'error': ( + "Can't find content for section: " + f"url={url} fragment={fragment}" + ) + }, + status=status.HTTP_404_NOT_FOUND + ) + + # Sanitize the URL before requesting it + sanitized_url = urlparse(url)._replace(fragment='', query='').geturl() + # Make links from the content to be absolute + content = clean_links( + content_requested, + sanitized_url, + html_raw_response=True, + ) + + response = { + 'url': url, + 'fragment': fragment if fragment else None, + 'content': content, + 'external': external, + } + return Response(response) + + +class EmbedAPI(SettingsOverrideObject): + _default_class = EmbedAPIBase diff --git a/readthedocs/embed/views.py b/readthedocs/embed/views.py index c63ac207a95..7237d4a012e 100644 --- a/readthedocs/embed/views.py +++ b/readthedocs/embed/views.py @@ -4,7 +4,6 @@ import json import logging import re -from urllib.parse import urlparse from django.shortcuts import get_object_or_404 from django.template.defaultfilters import slugify @@ -22,7 +21,7 @@ from readthedocs.core.resolver import resolve from 
readthedocs.core.unresolver import unresolve from readthedocs.core.utils.extend import SettingsOverrideObject -from readthedocs.embed.utils import recurse_while_none +from readthedocs.embed.utils import recurse_while_none, clean_links from readthedocs.projects.models import Project from readthedocs.storage import build_media_storage @@ -36,51 +35,6 @@ def escape_selector(selector): return ret -def clean_links(obj, url): - """ - Rewrite (internal) links to make them absolute. - - 1. external links are not changed - 2. prepend URL to links that are just fragments (e.g. #section) - 3. prepend URL (without filename) to internal relative links - """ - if url is None: - return obj - - for link in obj.find('a'): - base_url = urlparse(url) - # We need to make all internal links, to be absolute - href = link.attrib['href'] - parsed_href = urlparse(href) - if parsed_href.scheme or parsed_href.path.startswith('/'): - # don't change external links - continue - - if not parsed_href.path and parsed_href.fragment: - # href="#section-link" - new_href = base_url.geturl() + href - link.attrib['href'] = new_href - continue - - if not base_url.path.endswith('/'): - # internal relative link - # href="../../another.html" and ``base_url`` is not HTMLDir - # (e.g. /en/latest/deep/internal/section/page.html) - # we want to remove the trailing filename (page.html) and use the rest as base URL - # The resulting absolute link should be - # https://slug.readthedocs.io/en/latest/deep/internal/section/../../another.html - - # remove the filename (page.html) from the original document URL (base_url) and, - path, _ = base_url.path.rsplit('/', 1) - # append the value of href (../../another.html) to the base URL. 
@property
def RTD_EMBED_API_EXTERNAL_DOMAINS(self):
    """
    Domains allowed for external embedding in the docker-compose setup.

    Returns the patterns defined by the parent settings plus one that
    matches any ``*.readthedocs.io`` sub-domain.
    """
    # Build a fresh list instead of calling ``append`` on the value
    # returned by ``super()``: that value is the list object defined on
    # the parent settings, so appending in place would add one more
    # duplicate pattern to the shared list on every attribute access.
    return [
        *super().RTD_EMBED_API_EXTERNAL_DOMAINS,
        r'.*\.readthedocs\.io',
    ]
sphinx-{18,20,21,22,23,24,30,31,32,33,34,35,40,41,latest} + +[testenv] +description = run test suite for the EmbedAPIv3 +install_command = + # Install requirements in multiple steps because we don't want to install + # Sphinx from `requirements/pip.txt` but from the `deps=` field. + /bin/sh -c ' \ + cat {toxinidir}/requirements/pip.txt | grep -v "Sphinx" > {toxinidir}/requirements/embedapi.txt; \ + sed {toxinidir}/requirements/testing.txt -e "s|pip.txt|embedapi.txt|g" > {toxinidir}/requirements/testing.embedapi.txt; \ + pip install -r {toxinidir}/requirements/testing.embedapi.txt; \ + pip install sphinxcontrib-bibtex; \ + pip install $*;' -- {opts} {packages} +deps = + sphinx-18: Sphinx~=1.8.0 + sphinx-20: Sphinx~=2.0.0 + sphinx-21: Sphinx~=2.1.0 + sphinx-22: Sphinx~=2.2.0 + sphinx-23: Sphinx~=2.3.0 + sphinx-24: Sphinx~=2.4.0 + sphinx-30: Sphinx~=3.0.0 + sphinx-31: Sphinx~=3.1.0 + sphinx-32: Sphinx~=3.2.0 + sphinx-33: Sphinx~=3.3.0 + sphinx-34: Sphinx~=3.4.0 + sphinx-35: Sphinx~=3.5.0 + sphinx-40: Sphinx~=4.0.0 + sphinx-41: Sphinx~=4.1.0 + sphinx-latest: Sphinx +setenv = + DJANGO_SETTINGS_MODULE=readthedocs.settings.test +changedir = {toxinidir}/readthedocs +commands = pytest -m embed_api {posargs} \ No newline at end of file diff --git a/tox.ini b/tox.ini index 8752e5cc8c7..494a54c2e34 100644 --- a/tox.ini +++ b/tox.ini @@ -19,13 +19,12 @@ basepython = commands = /bin/sh -c '\ export DJANGO_SETTINGS_MODULE=readthedocs.settings.test; \ - pytest --cov-report= --cov-config {toxinidir}/.coveragerc --cov=. --suppress-no-test-exit-code -m "not proxito" {posargs:{env:TOX_POSARGS:-m "not search and not proxito"}}' + pytest --cov-report= --cov-config {toxinidir}/.coveragerc --cov=. --suppress-no-test-exit-code -m "not proxito and not embed_api" {posargs:{env:TOX_POSARGS:-m "not search and not proxito and not embed_api"}}' /bin/sh -c '\ export DJANGO_SETTINGS_MODULE=readthedocs.settings.proxito.test; \ pytest --cov-report= --cov-config {toxinidir}/.coveragerc --cov=. 
--cov-append -m proxito --suppress-no-test-exit-code {posargs}' - [testenv:docs] description = build readthedocs documentation changedir = {toxinidir}/docs