
Search indexing with storage #5854


Merged
11 commits merged on Aug 8, 2019
23 changes: 20 additions & 3 deletions readthedocs/builds/storage.py
@@ -60,10 +60,10 @@ def delete_directory(self, path):
for folder_name in folders:
if folder_name:
# Recursively delete the subdirectory
self.delete_directory(safe_join(path, folder_name))
self.delete_directory(self.join(path, folder_name))
for filename in files:
if filename:
self.delete(safe_join(path, filename))
self.delete(self.join(path, filename))

def copy_directory(self, source, destination):
"""
@@ -75,14 +75,31 @@ def copy_directory(self, source, destination):
log.debug('Copying source directory %s to media storage at %s', source, destination)
source = Path(source)
for filepath in source.iterdir():
sub_destination = safe_join(destination, filepath.name)
sub_destination = self.join(destination, filepath.name)
if filepath.is_dir():
# Recursively copy the subdirectory
self.copy_directory(filepath, sub_destination)
elif filepath.is_file():
with filepath.open('rb') as fd:
self.save(sub_destination, fd)

def join(self, directory, filepath):
return safe_join(directory, filepath)

def walk(self, top):
if top in ('', '/'):
raise SuspiciousFileOperation('Iterating all storage cannot be right')

log.debug('Walking %s in media storage', top)
folders, files = self.listdir(self._dirpath(top))

yield top, folders, files

for folder_name in folders:
if folder_name:
# Recursively walk the subdirectory
yield from self.walk(self.join(top, folder_name))


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):

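For context on the storage.py changes above, here is a minimal usage sketch of the new join() and walk() helpers. The 'html/myproject/latest' prefix is invented; real prefixes come from Project.get_storage_path(), and the filesystem backend assumes a configured MEDIA_ROOT.

```python
# Minimal sketch, not part of the diff; the prefix below is a made-up example.
from readthedocs.builds.storage import BuildMediaFileSystemStorage

storage = BuildMediaFileSystemStorage()

# walk() mirrors os.walk(): it yields (top, folders, files) for the given
# prefix and recurses into each subdirectory via join(); walking '' or '/'
# raises SuspiciousFileOperation.
for top, folders, files in storage.walk('html/myproject/latest'):
    for filename in files:
        print(storage.join(top, filename))
```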
46 changes: 28 additions & 18 deletions readthedocs/projects/models.py
@@ -1231,26 +1231,36 @@ def get_processed_json(self):
Both lead to `foo/index.html`
https://github.com/rtfd/readthedocs.org/issues/5368
"""
fjson_paths = []
basename = os.path.splitext(self.path)[0]
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
fjson_paths.append(new_basename + '.fjson')

full_json_path = self.project.get_production_media_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for fjson_path in fjson_paths:
file_path = os.path.join(full_json_path, fjson_path)
if os.path.exists(file_path):
return process_file(file_path)
except Exception:
file_path = None

if settings.RTD_BUILD_MEDIA_STORAGE:
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

fjson_paths = []
basename = os.path.splitext(self.path)[0]
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
fjson_paths.append(new_basename + '.fjson')

storage_path = self.project.get_storage_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for fjson_path in fjson_paths:
Member:

I believe we should be able to get away from this soon. We should be starting to store the proper path of a file after readthedocs/readthedocs-sphinx-ext#62 is merged.

file_path = storage.join(storage_path, fjson_path)
if storage.exists(file_path):
return process_file(file_path)
except Exception:
log.warning(
'Unhandled exception during search processing file: %s',
file_path,
)
else:
log.warning(
'Skipping HTMLFile processing because of no storage backend'
)

return {
'headers': [],
'content': '',
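As a side note on the get_processed_json() hunk above, the candidate-path logic can be read in isolation. A rough standalone sketch, with an invented page path:

```python
import os
import re


def candidate_fjson_paths(html_path):
    """Return the .fjson paths that may hold the JSON output for a page."""
    basename = os.path.splitext(html_path)[0]
    paths = [basename + '.fjson']
    # 'foo/index.html' can be produced from either 'foo.rst' or 'foo/index.rst',
    # so also try the directory-style name with '/index' stripped.
    if basename.endswith('/index'):
        paths.append(re.sub(r'\/index$', '', basename) + '.fjson')
    return paths


# e.g. candidate_fjson_paths('foo/index.html') == ['foo/index.fjson', 'foo.fjson']
```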
59 changes: 39 additions & 20 deletions readthedocs/projects/tasks.py
@@ -1268,7 +1268,6 @@ def fileify(version_pk, commit, build):
)
return

path = project.rtd_build_path(version.slug)
log.info(
LOG_TEMPLATE,
{
@@ -1278,40 +1277,47 @@
}
)
try:
_manage_imported_files(version, path, commit, build)
_manage_imported_files(version, commit, build)
except Exception:
log.exception('Failed during ImportedFile creation')

try:
_update_intersphinx_data(version, path, commit, build)
_update_intersphinx_data(version, commit, build)
except Exception:
log.exception('Failed during SphinxDomain creation')


def _update_intersphinx_data(version, path, commit, build):
def _update_intersphinx_data(version, commit, build):
"""
Update intersphinx data for this version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

object_file = os.path.join(path, 'objects.inv')
if not os.path.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
if not settings.RTD_BUILD_MEDIA_STORAGE:
return

full_json_path = version.project.get_production_media_path(
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

html_storage_path = version.project.get_storage_path(
type_='html', version_slug=version.slug, include_file=False
)
json_storage_path = version.project.get_storage_path(
type_='json', version_slug=version.slug, include_file=False
)
type_file = os.path.join(full_json_path, 'readthedocs-sphinx-domain-names.json')

object_file = storage.join(html_storage_path, 'objects.inv')
if not storage.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
return

type_file = storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
types = {}
titles = {}
if os.path.exists(type_file):
if storage.exists(type_file):
try:
data = json.load(open(type_file))
data = json.load(storage.open(type_file))
types = data['types']
titles = data['titles']
except Exception:
@@ -1331,7 +1337,13 @@ def warn(self, msg):
log.warning('Sphinx MockApp: %s', msg)

# Re-create all objects from the new build of the version
invdata = intersphinx.fetch_inventory(MockApp(), '', object_file)
object_file_url = storage.url(object_file)
if object_file_url.startswith('/'):
# Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
# This can cause an issue if MEDIA_URL is not fully qualified
object_file_url = 'http://' + settings.PRODUCTION_DOMAIN + object_file_url

invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
for key, value in sorted(invdata.items() or {}):
domain, _type = key.split(':')
for name, einfo in sorted(value.items()):
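For readers unfamiliar with Sphinx inventories, the loop above expects roughly the following shape from fetch_inventory(); every project name and URL here is invented for illustration.

```python
# Illustrative only: keys are 'domain:type'; each entry maps an object name
# to a (project, version, URL, display name) tuple.
invdata = {
    'py:function': {
        'requests.get': ('Requests', '2.0', 'api.html#requests.get', '-'),
    },
    'std:label': {
        'quickstart': ('Requests', '2.0', 'user/quickstart.html#quickstart', 'Quickstart'),
    },
}

for key, value in sorted(invdata.items()):
    domain, _type = key.split(':')
    for name, einfo in sorted(value.items()):
        print(domain, _type, name, einfo[2])
```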
@@ -1419,29 +1431,36 @@ def clean_build(version_pk):
return True


def _manage_imported_files(version, path, commit, build):
def _manage_imported_files(version, commit, build):
"""
Update imported files for version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

if not settings.RTD_BUILD_MEDIA_STORAGE:
return
Member:

We should likely log something here, so we don't get confused if indexing isn't working.
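A minimal sketch of what that logging might look like, mirroring the message used in models.py; the exact wording is an assumption and it would slot into the early return of _manage_imported_files:

```python
# Hypothetical early-return logging (wording assumed, not part of the diff)
if not settings.RTD_BUILD_MEDIA_STORAGE:
    log.warning('Skipping ImportedFile creation because of no storage backend')
    return
```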


storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

changed_files = set()

# Re-create all objects from the new build of the version
for root, __, filenames in os.walk(path):
storage_path = version.project.get_storage_path(
type_='html', version_slug=version.slug, include_file=False
)
for root, __, filenames in storage.walk(storage_path):
for filename in filenames:
if filename.endswith('.html'):
model_class = HTMLFile
else:
Member:

I'd like to add the elif project.cdn_enabled here with an else: continue. 👍

Contributor Author:

What does project.cdn_enabled do exactly and what would it do differently for search?

Member:

It means we store all ImportedFiles, not just HTMLFiles, so we can purge them from the CDN properly when they change.
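A rough sketch of the suggested branching, assuming cdn_enabled is a boolean flag on the project; this is illustrative only and replaces the loop body shown in the diff, not part of this PR:

```python
# Hypothetical loop body with the suggested elif/else (cdn_enabled assumed)
if filename.endswith('.html'):
    model_class = HTMLFile
elif version.project.cdn_enabled:
    # Record every file so changed files can be purged from the CDN
    model_class = ImportedFile
else:
    # Neither indexed for search nor tracked for CDN purging
    continue
```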

model_class = ImportedFile

full_path = os.path.join(root, filename)
relpath = os.path.relpath(full_path, path)
relpath = storage.join(root, filename)
try:
md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
md5 = hashlib.md5(storage.open(relpath, 'rb').read()).hexdigest()
except Exception:
log.exception(
'Error while generating md5 for %s:%s:%s. Don\'t stop.',
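Taken together, the storage-based scanning in _manage_imported_files boils down to the following pattern. This is a sketch assuming a backend that uses BuildMediaStorageMixin (which provides walk() and join()), with an invented prefix.

```python
import hashlib

from django.conf import settings
from django.core.files.storage import get_storage_class


def checksum_stored_files(prefix):
    """Yield (path, md5) for every file under ``prefix`` in build media storage."""
    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    for root, __, filenames in storage.walk(prefix):
        for filename in filenames:
            path = storage.join(root, filename)
            with storage.open(path, 'rb') as fd:
                yield path, hashlib.md5(fd.read()).hexdigest()


# e.g. for path, md5 in checksum_stored_files('html/myproject/latest'): ...
```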
11 changes: 5 additions & 6 deletions readthedocs/rtd_tests/tests/test_search_json_parsing.py
@@ -2,21 +2,20 @@
import os

from django.test import TestCase
from django.test.utils import override_settings

from readthedocs.search.parse_json import process_file


base_dir = os.path.dirname(os.path.dirname(__file__))


class TestHacks(TestCase):

@override_settings(MEDIA_ROOT=base_dir)
def test_h2_parsing(self):
data = process_file(
os.path.join(
base_dir,
'files/api.fjson',
),
)
data = process_file('files/api.fjson')

self.assertEqual(data['sections'][1]['id'], 'a-basic-api-client-using-slumber')
# Only capture h2's after the first section
for obj in data['sections'][1:]:
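The test change above works because Django's default FileSystemStorage resolves relative names under MEDIA_ROOT, so overriding MEDIA_ROOT to the tests directory lets process_file('files/api.fjson') find the fixture. A quick sketch with an invented location:

```python
from django.core.files.storage import FileSystemStorage

# With MEDIA_ROOT (or an explicit location) pointing at the tests directory,
# a relative name like 'files/api.fjson' resolves inside that directory.
storage = FileSystemStorage(location='/path/to/rtd_tests')  # invented path
print(storage.path('files/api.fjson'))  # -> '/path/to/rtd_tests/files/api.fjson'
```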
22 changes: 14 additions & 8 deletions readthedocs/search/parse_json.py
@@ -1,9 +1,11 @@
"""Functions related to converting content into dict/JSON structures."""

import codecs
import json
import logging

from django.conf import settings
from django.core.files.storage import get_storage_class

from pyquery import PyQuery


@@ -59,13 +61,17 @@ def generate_sections_from_pyquery(body):
}


def process_file(fjson_filename):
def process_file(fjson_storage_path):
"""Read the fjson file from disk and parse it into a structured dict."""
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

log.debug('Processing JSON file for indexing: %s', fjson_storage_path)

try:
with codecs.open(fjson_filename, encoding='utf-8', mode='r') as f:
with storage.open(fjson_storage_path, mode='r') as f:
file_contents = f.read()
except IOError:
log.info('Unable to read file: %s', fjson_filename)
log.info('Unable to read file: %s', fjson_storage_path)
raise
data = json.loads(file_contents)
sections = []
@@ -76,24 +82,24 @@ def process_file(fjson_filename):
if 'current_page_name' in data:
path = data['current_page_name']
else:
log.info('Unable to index file due to no name %s', fjson_filename)
log.info('Unable to index file due to no name %s', fjson_storage_path)

if data.get('body'):
body = PyQuery(data['body'])
body_content = body.text().replace('¶', '')
sections.extend(generate_sections_from_pyquery(body))
else:
log.info('Unable to index content for: %s', fjson_filename)
log.info('Unable to index content for: %s', fjson_storage_path)

if 'title' in data:
title = data['title']
if title.startswith('<'):
title = PyQuery(data['title']).text()
else:
log.info('Unable to index title for: %s', fjson_filename)
log.info('Unable to index title for: %s', fjson_storage_path)

return {
'headers': process_headers(data, fjson_filename),
'headers': process_headers(data, fjson_storage_path),
'content': body_content,
'path': path,
'title': title,
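Finally, a hedged example of driving the reworked process_file(). It assumes RTD_BUILD_MEDIA_STORAGE points at a configured backend and that the .fjson path below (invented) exists in storage; only the keys visible in the return dict above are used.

```python
from readthedocs.search.parse_json import process_file

# Hypothetical storage path for a built page's JSON output
page = process_file('json/myproject/latest/install.fjson')
print(page['title'])
print(page['path'])
print(len(page['content']))
```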