
Search indexing with storage #5854


Merged: 11 commits, merged Aug 8, 2019
23 changes: 20 additions & 3 deletions readthedocs/builds/storage.py
@@ -64,10 +64,10 @@ def delete_directory(self, path):
         for folder_name in folders:
             if folder_name:
                 # Recursively delete the subdirectory
-                self.delete_directory(safe_join(path, folder_name))
+                self.delete_directory(self.join(path, folder_name))
         for filename in files:
             if filename:
-                self.delete(safe_join(path, filename))
+                self.delete(self.join(path, filename))

     def copy_directory(self, source, destination):
         """
@@ -79,14 +79,31 @@ def copy_directory(self, source, destination):
         log.debug('Copying source directory %s to media storage at %s', source, destination)
         source = Path(source)
         for filepath in source.iterdir():
-            sub_destination = safe_join(destination, filepath.name)
+            sub_destination = self.join(destination, filepath.name)
             if filepath.is_dir():
                 # Recursively copy the subdirectory
                 self.copy_directory(filepath, sub_destination)
             elif filepath.is_file():
                 with filepath.open('rb') as fd:
                     self.save(sub_destination, fd)

+    def join(self, directory, filepath):
+        return safe_join(directory, filepath)
+
+    def walk(self, top):
+        if top in ('', '/'):
+            raise SuspiciousFileOperation('Iterating all storage cannot be right')
+
+        log.debug('Walking %s in media storage', top)
+        folders, files = self.listdir(self._dirpath(top))
+
+        yield top, folders, files
+
+        for folder_name in folders:
+            if folder_name:
+                # Recursively walk the subdirectory
+                yield from self.walk(self.join(top, folder_name))


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):

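A minimal usage sketch (not part of the diff) of the new walk() method: it yields (top, folders, files) tuples like os.walk(), but through the storage API, so the same traversal code works against local disk or a remote backend. The temp directory and file names here are illustrative:

    import tempfile

    from django.core.files.base import ContentFile

    from readthedocs.builds.storage import BuildMediaFileSystemStorage

    # Save two files, one of them inside a subdirectory
    storage = BuildMediaFileSystemStorage(location=tempfile.mkdtemp())
    storage.save('files/index.html', ContentFile(b'<html></html>'))
    storage.save('files/guides/install.html', ContentFile(b'<html></html>'))

    # Walk the tree through the storage API, like os.walk() over a directory
    for top, folders, files in storage.walk('files'):
        print(top, folders, files)
    # files ['guides'] ['index.html']
    # files/guides [] ['install.html']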
46 changes: 28 additions & 18 deletions readthedocs/projects/models.py
@@ -1215,26 +1215,36 @@ def get_processed_json(self):
         Both lead to `foo/index.html`
         https://github.com/rtfd/readthedocs.org/issues/5368
         """
-        fjson_paths = []
-        basename = os.path.splitext(self.path)[0]
-        fjson_paths.append(basename + '.fjson')
-        if basename.endswith('/index'):
-            new_basename = re.sub(r'\/index$', '', basename)
-            fjson_paths.append(new_basename + '.fjson')
-
-        full_json_path = self.project.get_production_media_path(
-            type_='json', version_slug=self.version.slug, include_file=False
-        )
-        try:
-            for fjson_path in fjson_paths:
-                file_path = os.path.join(full_json_path, fjson_path)
-                if os.path.exists(file_path):
-                    return process_file(file_path)
-        except Exception:
-            file_path = None

+        if settings.RTD_BUILD_MEDIA_STORAGE:
+            storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+            fjson_paths = []
+            basename = os.path.splitext(self.path)[0]
+            fjson_paths.append(basename + '.fjson')
+            if basename.endswith('/index'):
+                new_basename = re.sub(r'\/index$', '', basename)
+                fjson_paths.append(new_basename + '.fjson')
+
+            storage_path = self.project.get_storage_path(
+                type_='json', version_slug=self.version.slug, include_file=False
+            )
+            try:
+                for fjson_path in fjson_paths:
[Review comment (Member)]: I believe we should be able to get away from this soon. We should be starting to store the proper path of a file after readthedocs/readthedocs-sphinx-ext#62 is merged.

+                    file_path = storage.join(storage_path, fjson_path)
+                    if storage.exists(file_path):
+                        return process_file(file_path)
+            except Exception:
+                log.warning(
+                    'Unhandled exception during search processing file: %s',
+                    file_path,
+                )
+        else:
+            log.warning(
+                'Skipping HTMLFile processing because of no storage backend'
+            )

         return {
             'headers': [],
             'content': '',
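For reference, the candidate-path logic above (unchanged by this PR apart from where the files are read from) derives two possible .fjson names for a page served at foo/index.html, per the docstring's note. A standalone sketch of that derivation:

    import os
    import re

    path = 'foo/index.html'
    basename = os.path.splitext(path)[0]   # 'foo/index'
    fjson_paths = [basename + '.fjson']    # Sphinx JSON builder output
    if basename.endswith('/index'):
        # foo.rst and foo/index.rst both build to foo/index.html
        fjson_paths.append(re.sub(r'\/index$', '', basename) + '.fjson')

    print(fjson_paths)  # ['foo/index.fjson', 'foo.fjson']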
65 changes: 44 additions & 21 deletions readthedocs/projects/tasks.py
@@ -1268,7 +1268,6 @@ def fileify(version_pk, commit, build):
         )
         return

-    path = project.rtd_build_path(version.slug)
     log.info(
         LOG_TEMPLATE,
         {
@@ -1278,40 +1277,47 @@ def fileify(version_pk, commit, build):
         }
     )
     try:
-        _manage_imported_files(version, path, commit, build)
+        _manage_imported_files(version, commit, build)
     except Exception:
         log.exception('Failed during ImportedFile creation')

     try:
-        _update_intersphinx_data(version, path, commit, build)
+        _update_intersphinx_data(version, commit, build)
     except Exception:
         log.exception('Failed during SphinxDomain creation')


-def _update_intersphinx_data(version, path, commit, build):
+def _update_intersphinx_data(version, commit, build):
"""
Update intersphinx data for this version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

object_file = os.path.join(path, 'objects.inv')
if not os.path.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
if not settings.RTD_BUILD_MEDIA_STORAGE:
return

-    full_json_path = version.project.get_production_media_path(
-        type_='json', version_slug=version.slug, include_file=False
-    )
-    type_file = os.path.join(full_json_path, 'readthedocs-sphinx-domain-names.json')
+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+    html_storage_path = version.project.get_storage_path(
+        type_='html', version_slug=version.slug, include_file=False
+    )
+    json_storage_path = version.project.get_storage_path(
+        type_='json', version_slug=version.slug, include_file=False
+    )
+
+    object_file = storage.join(html_storage_path, 'objects.inv')
+    if not storage.exists(object_file):
+        log.debug('No objects.inv, skipping intersphinx indexing.')
+        return
+
+    type_file = storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
     types = {}
     titles = {}
-    if os.path.exists(type_file):
+    if storage.exists(type_file):
         try:
-            data = json.load(open(type_file))
+            data = json.load(storage.open(type_file))
             types = data['types']
             titles = data['titles']
         except Exception:
@@ -1331,7 +1337,13 @@ def warn(self, msg):
             log.warning('Sphinx MockApp: %s', msg)

     # Re-create all objects from the new build of the version
-    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file)
+    object_file_url = storage.url(object_file)
+    if object_file_url.startswith('/'):
+        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
+        # This can cause an issue if MEDIA_URL is not fully qualified
+        object_file_url = 'http://' + settings.PRODUCTION_DOMAIN + object_file_url
+
+    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
     for key, value in sorted(invdata.items() or {}):
         domain, _type = key.split(':')
         for name, einfo in sorted(value.items()):
@@ -1424,29 +1436,40 @@ def clean_build(version_pk):
     return True


-def _manage_imported_files(version, path, commit, build):
+def _manage_imported_files(version, commit, build):
     """
     Update imported files for version.

     :param version: Version instance
-    :param path: Path to search
     :param commit: Commit that updated path
     :param build: Build id
     """
+    if not settings.RTD_BUILD_MEDIA_STORAGE:
+        return
[Review comment (Member)]: We should likely log something here, so we don't get confused if indexing isn't working.


+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

     changed_files = set()

     # Re-create all objects from the new build of the version
-    for root, __, filenames in os.walk(path):
+    storage_path = version.project.get_storage_path(
+        type_='html', version_slug=version.slug, include_file=False
+    )
+    for root, __, filenames in storage.walk(storage_path):
         for filename in filenames:
             if filename.endswith('.html'):
                 model_class = HTMLFile
-            else:
+            elif version.project.cdn_enabled:
+                # We need to track all files for CDN enabled projects so the files can be purged
                 model_class = ImportedFile
+            else:
+                # For projects not behind a CDN, we don't care about non-HTML
+                continue

-            full_path = os.path.join(root, filename)
-            relpath = os.path.relpath(full_path, path)
+            relpath = storage.join(root, filename)
             try:
-                md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
+                md5 = hashlib.md5(storage.open(relpath, 'rb').read()).hexdigest()
             except Exception:
                 log.exception(
                     'Error while generating md5 for %s:%s:%s. Don\'t stop.',
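A small sketch of the URL-qualification step in _update_intersphinx_data above: FileSystemStorage builds URLs by prepending MEDIA_URL, which is typically a bare path like /media/, and intersphinx.fetch_inventory needs something it can actually retrieve, so a scheme and host are prepended. The domain value here is an illustrative stand-in for settings.PRODUCTION_DOMAIN:

    PRODUCTION_DOMAIN = 'readthedocs.org'  # illustrative stand-in for the setting

    def qualify_object_file_url(object_file_url):
        # Filesystem-backed storage returns MEDIA_URL-relative paths ('/media/...');
        # cloud storage backends already return absolute URLs and pass through.
        if object_file_url.startswith('/'):
            object_file_url = 'http://' + PRODUCTION_DOMAIN + object_file_url
        return object_file_url

    print(qualify_object_file_url('/media/html/pip/latest/objects.inv'))
    # http://readthedocs.org/media/html/pip/latest/objects.inv
    print(qualify_object_file_url('https://build-media.example.com/objects.inv'))
    # https://build-media.example.com/objects.inv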
52 changes: 52 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
@@ -0,0 +1,52 @@
import os
import shutil
import tempfile

from django.test import TestCase

from readthedocs.builds.storage import BuildMediaFileSystemStorage


files_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'files')


class TestBuildMediaStorage(TestCase):
    def setUp(self):
        self.test_media_dir = tempfile.mkdtemp()
        self.storage = BuildMediaFileSystemStorage(location=self.test_media_dir)

    def tearDown(self):
        shutil.rmtree(self.test_media_dir, ignore_errors=True)

    def test_copy_directory(self):
        self.assertFalse(self.storage.exists('files/test.html'))

        self.storage.copy_directory(files_dir, 'files')
        self.assertTrue(self.storage.exists('files/test.html'))
        self.assertTrue(self.storage.exists('files/conf.py'))
        self.assertTrue(self.storage.exists('files/api.fjson'))

    def test_delete_directory(self):
        self.storage.copy_directory(files_dir, 'files')
        dirs, files = self.storage.listdir('files')
        self.assertEqual(dirs, [])
        self.assertEqual(files, ['api.fjson', 'conf.py', 'test.html'])

        self.storage.delete_directory('files/')
        dirs, files = self.storage.listdir('files')
        self.assertEqual(dirs, [])
        self.assertEqual(files, [])

    def test_walk(self):
        self.storage.copy_directory(files_dir, 'files')
        self.storage.copy_directory(files_dir, 'files/subfiles')

        output = list(self.storage.walk('files'))
        self.assertEqual(len(output), 2)
        self.assertEqual(
            output,
            [
                ('files', ['subfiles'], ['api.fjson', 'conf.py', 'test.html']),
                ('files/subfiles', [], ['api.fjson', 'conf.py', 'test.html']),
            ],
        )
47 changes: 37 additions & 10 deletions readthedocs/rtd_tests/tests/test_imported_file.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 import os

+from django.conf import settings
+from django.core.files.storage import get_storage_class
 from django.test import TestCase

 from readthedocs.projects.models import ImportedFile, Project
@@ -13,24 +15,45 @@
 class ImportedFileTests(TestCase):
     fixtures = ['eric', 'test_data']

+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
     def setUp(self):
         self.project = Project.objects.get(slug='pip')
         self.version = self.project.versions.first()

+        self.test_dir = os.path.join(base_dir, 'files')
+        self._copy_storage_dir()
+
+    def _copy_storage_dir(self):
+        self.storage.copy_directory(
+            self.test_dir,
+            self.project.get_storage_path(
+                type_='html',
+                version_slug=self.version.slug,
+                include_file=False,
+            ),
+        )

     def test_properly_created(self):
-        test_dir = os.path.join(base_dir, 'files')
+        # Only one file in the directory is HTML
         self.assertEqual(ImportedFile.objects.count(), 0)
-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
-        self.assertEqual(ImportedFile.objects.count(), 3)
-        _manage_imported_files(self.version, test_dir, 'commit01', 2)
+        _manage_imported_files(self.version, 'commit01', 1)
+        self.assertEqual(ImportedFile.objects.count(), 1)
+        _manage_imported_files(self.version, 'commit01', 2)
         self.assertEqual(ImportedFile.objects.count(), 1)
+
+        self.project.cdn_enabled = True
+        self.project.save()
+
+        # CDN enabled projects => save all files
+        _manage_imported_files(self.version, 'commit01', 3)
+        self.assertEqual(ImportedFile.objects.count(), 3)

     def test_update_commit(self):
-        test_dir = os.path.join(base_dir, 'files')
         self.assertEqual(ImportedFile.objects.count(), 0)
-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
+        _manage_imported_files(self.version, 'commit01', 1)
         self.assertEqual(ImportedFile.objects.first().commit, 'commit01')
-        _manage_imported_files(self.version, test_dir, 'commit02', 2)
+        _manage_imported_files(self.version, 'commit02', 2)
         self.assertEqual(ImportedFile.objects.first().commit, 'commit02')

     def test_update_content(self):
@@ -40,13 +63,17 @@ def test_update_content(self):
         with open(os.path.join(test_dir, 'test.html'), 'w+') as f:
             f.write('Woo')

-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
+        self._copy_storage_dir()
+
+        _manage_imported_files(self.version, 'commit01', 1)
         self.assertEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')

         with open(os.path.join(test_dir, 'test.html'), 'w+') as f:
             f.write('Something Else')

-        _manage_imported_files(self.version, test_dir, 'commit02', 2)
+        self._copy_storage_dir()
+
+        _manage_imported_files(self.version, 'commit02', 2)
         self.assertNotEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')

-        self.assertEqual(ImportedFile.objects.count(), 3)
+        self.assertEqual(ImportedFile.objects.count(), 1)
11 changes: 5 additions & 6 deletions readthedocs/rtd_tests/tests/test_search_json_parsing.py
@@ -2,21 +2,20 @@
 import os

 from django.test import TestCase
+from django.test.utils import override_settings

 from readthedocs.search.parse_json import process_file


 base_dir = os.path.dirname(os.path.dirname(__file__))


 class TestHacks(TestCase):

+    @override_settings(MEDIA_ROOT=base_dir)
     def test_h2_parsing(self):
-        data = process_file(
-            os.path.join(
-                base_dir,
-                'files/api.fjson',
-            ),
-        )
+        data = process_file('files/api.fjson')

         self.assertEqual(data['sections'][1]['id'], 'a-basic-api-client-using-slumber')
         # Only capture h2's after the first section
         for obj in data['sections'][1:]: