Skip to content

Save config on build model #4749

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Nov 7, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
6891db8
New field
stsewd Oct 11, 2018
b1b3620
Change API
stsewd Oct 11, 2018
27d8251
Implement as_dict to config
stsewd Oct 11, 2018
ab5ea04
One more test for the api
stsewd Oct 12, 2018
5f4fb8c
Save config on build
stsewd Oct 12, 2018
9d70812
Simple query
stsewd Oct 12, 2018
13abeda
Test for version.config property
stsewd Oct 12, 2018
98d26a4
Use callable as default
stsewd Oct 15, 2018
660c03a
Update migration
stsewd Oct 23, 2018
d29b6d9
Save on model only if there is a difference
stsewd Oct 23, 2018
7fc1e2b
Merge branch 'master' into save-config-on-build
stsewd Oct 23, 2018
8f62f03
More tests
stsewd Oct 23, 2018
76929ac
Linter
stsewd Oct 23, 2018
7ade872
One more test
stsewd Oct 23, 2018
6f8becc
Better approach for save config
stsewd Oct 23, 2018
07efa1b
Tests
stsewd Oct 23, 2018
715e5ed
Fix test for API
stsewd Oct 23, 2018
dab5860
Update docstrings
stsewd Oct 23, 2018
5aacd63
Merge branch 'master' into save-config-on-build
stsewd Oct 23, 2018
822b793
Check for previous only on first creation or if the object hast changed
stsewd Nov 2, 2018
8c60760
Merge branch 'master' into save-config-on-build
stsewd Nov 2, 2018
7ed110d
Rebase migration
stsewd Nov 2, 2018
2ffdbf6
Use just patch
stsewd Nov 2, 2018
5cce5d8
Docstring
stsewd Nov 5, 2018
9d8266f
Don't reference to empty builds
stsewd Nov 5, 2018
28cd935
Fix varname
stsewd Nov 5, 2018
433b709
Add test
stsewd Nov 5, 2018
b88837d
Docstring
stsewd Nov 5, 2018
4723705
Merge branch 'master' into save-config-on-build
stsewd Nov 5, 2018
043f4e6
Don't expose `_config` on the API
stsewd Nov 6, 2018
79fbb8e
Refactor as feedback
stsewd Nov 7, 2018
f33d54a
Merge branch 'master' into save-config-on-build
stsewd Nov 7, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions readthedocs/builds/migrations/0005_add_config_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.9.13 on 2018-10-11 14:00
from __future__ import unicode_literals

from django.db import migrations, models
import jsonfield.fields
import readthedocs.builds.version_slug


class Migration(migrations.Migration):

dependencies = [
('builds', '0004_add-apiversion-proxy-model'),
]

operations = [
migrations.AddField(
model_name='build',
name='config',
field=jsonfield.fields.JSONField(default={}, verbose_name='Configuration used in the build'),
),
migrations.AlterField(
model_name='build',
name='error',
field=models.TextField(blank=True, default='', verbose_name='Error'),
),
migrations.AlterField(
model_name='build',
name='output',
field=models.TextField(blank=True, default='', verbose_name='Output'),
),
migrations.AlterField(
model_name='build',
name='state',
field=models.CharField(choices=[('triggered', 'Triggered'), ('cloning', 'Cloning'), ('installing', 'Installing'), ('building', 'Building'), ('finished', 'Finished')], default='finished', max_length=55, verbose_name='State'),
),
migrations.AlterField(
model_name='build',
name='type',
field=models.CharField(choices=[('html', 'HTML'), ('pdf', 'PDF'), ('epub', 'Epub'), ('man', 'Manpage'), ('dash', 'Dash')], default='html', max_length=55, verbose_name='Type'),
),
migrations.AlterField(
model_name='version',
name='privacy_level',
field=models.CharField(choices=[('public', 'Public'), ('protected', 'Protected'), ('private', 'Private')], default='public', help_text='Level of privacy for this Version.', max_length=20, verbose_name='Privacy Level'),
),
migrations.AlterField(
model_name='version',
name='slug',
field=readthedocs.builds.version_slug.VersionSlugField(db_index=True, max_length=255, populate_from='verbose_name', verbose_name='Slug'),
),
migrations.AlterField(
model_name='version',
name='type',
field=models.CharField(choices=[('branch', 'Branch'), ('tag', 'Tag'), ('unknown', 'Unknown')], default='unknown', max_length=20, verbose_name='Type'),
),
migrations.AlterField(
model_name='versionalias',
name='from_slug',
field=models.CharField(default='', max_length=255, verbose_name='From slug'),
),
migrations.AlterField(
model_name='versionalias',
name='to_slug',
field=models.CharField(blank=True, default='', max_length=255, verbose_name='To slug'),
),
]
46 changes: 38 additions & 8 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,56 @@
"""Models for the builds app."""

from __future__ import (
absolute_import, division, print_function, unicode_literals)
absolute_import,
division,
print_function,
unicode_literals,
)

import logging
import os.path
import re
from builtins import object
from shutil import rmtree

from builtins import object
from django.conf import settings
from django.core.urlresolvers import reverse
from django.db import models
from django.utils.encoding import python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from django.utils.translation import ugettext_lazy as _
from guardian.shortcuts import assign
from jsonfield import JSONField
from taggit.managers import TaggableManager

from readthedocs.core.utils import broadcast
from readthedocs.projects.constants import (
BITBUCKET_URL, GITHUB_URL, GITLAB_URL, PRIVACY_CHOICES, PRIVATE)
BITBUCKET_URL,
GITHUB_URL,
GITLAB_URL,
PRIVACY_CHOICES,
PRIVATE,
)
from readthedocs.projects.models import APIProject, Project

from .constants import (
BRANCH, BUILD_STATE, BUILD_STATE_FINISHED, BUILD_TYPES, LATEST,
NON_REPOSITORY_VERSIONS, STABLE, TAG, VERSION_TYPES)
BRANCH,
BUILD_STATE,
BUILD_STATE_FINISHED,
BUILD_TYPES,
LATEST,
NON_REPOSITORY_VERSIONS,
STABLE,
TAG,
VERSION_TYPES,
)
from .managers import VersionManager
from .querysets import BuildQuerySet, RelatedBuildQuerySet, VersionQuerySet
from .utils import (
get_bitbucket_username_repo, get_github_username_repo,
get_gitlab_username_repo)
get_bitbucket_username_repo,
get_github_username_repo,
get_gitlab_username_repo,
)
from .version_slug import VersionSlugField

DEFAULT_VERSION_PRIVACY_LEVEL = getattr(
Expand Down Expand Up @@ -109,6 +129,15 @@ def __str__(self):
pk=self.pk,
))

@property
def config(self):
last_build = (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder how long this query will take. Our build table is pretty large, but perhaps the filtering will make it a bit faster. Feels like something we might want to denormalize if we use it a lot.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use this, for now, we'll see if we can cache this maybe

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is only used in the tests. Do we need this at all? Did you add it as a helper for debugging or so? Does it worth to keep it here? Where do you think it will be used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be used on the api and resolver, it'll be used on the second part of this PR

self.builds.filter(type='html', state='finished')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still use the type field? It seems old and unnecessary at this point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, I was thinking the same.

.order_by('-date')
.first()
)
return last_build.config

@property
def commit_name(self):
"""
Expand Down Expand Up @@ -448,6 +477,7 @@ class Build(models.Model):
exit_code = models.IntegerField(_('Exit code'), null=True, blank=True)
commit = models.CharField(
_('Commit'), max_length=255, null=True, blank=True)
config = JSONField(_('Configuration used in the build'), default={})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this, but I think this will increase a lot the size of the db with not very useful data (because it will become old/inaccurate pretty soon). On the other hand, moving this to the cold storage I think that doesn't make sense since finally we won't be able to query these models and get the data we want.

Because of this problem, I was suggesting to save this config in the Version object and always save the config of the latest build per version. We don't really care what was the config of a build 2 months ago. This way, we will have only one config JSON saved into our DB per (active) version and it will be always up to date.

Besides, answering the question: "how many project use mkdocs?" with the current approach is complicated since we will need to count only the latest build per version. In the end, I think that in many cases what I'm suggesting is what we want here.

I want to make this stored data useful, help us to debug problems but also to make decisions when we need to know how users are using our own platform: "how much used is the build/latest image?" and more questions.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using the JSON example generated from the test and checking how many builds we do have per day, I can estimate that we will have an increase of 1.5Mb per day.

In [13]: Build.objects.filter(date__lt=datetime.date(2018, 10, 15), date__gt=datetime.date(2018, 10, 14)).count()
Out[13]: 3278

In [14]: text_json = '{"version": "1", "formats": ["pdf"], "python": {"version": 3.5, "requirements": "requirements.txt", "install_with_pip": false, "install_with_setup": false, "extra_requi
    ...: rements": [], "use_system_site_packages": false}, "build": {"image": "readthedocs/build:2.0"}, "conda": null, "sphinx": {"builder": "sphinx", "configuration": null, "fail_on_warning
    ...: ": false}, "mkdocs": {"configuration": null, "fail_on_warning": false}, "doctype": "sphinx", "submodules": {"include": 1, "exclude": [], "recursive": true}}'
    ...: 

In [15]: len(text_json)
Out[15]: 505

In [16]: 505 * 3278 / 1024 ** 2
Out[16]: 1.5787029266357422

(please, verify my math and logic)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.5MB per day feels like not that much storage, to be honest. It's less about 500MB a year, give or take. Of course, doing this kind of stuff is how we keep filling up our DB, so probably best not to take the naive approach.

I do see value in having recent changes of the config files stored. I wonder if there is a good way to store it on the Version, but keep a historical copy of all changes. We don't want to keep 500 copies of the same exact data structure, but being able to diff them over time feels very useful, especially when a build goes from passing to failing, or similar.

Copy link
Member

@ericholscher ericholscher Oct 15, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That might be overkill, perhaps we could just only save the config when it differs from the previously found one on the Build for that version? Something that limits data so we're not just storing the same data over and over.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if there is an easy way, I'd like to have this to show to users the configuration that was used on each build, if we don't save the full config we need to query the previous one before saving (this is easy) to see if they differ. If we want to show the config for a specific build, we need to know the previous one. So, maybe another model? Or maybe a foreign key to the previous Build?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally prefer to take an action about this before merging.

@ericholscher eric I like your proposal. I think it's "simple enough" to implement and shouldn't complicate things too much if we save "the whole config" if "we detect that differs", instead of saving a diff that later will be more complicated to generate the real config JSON text.

This way, we can query "what's the latest build of this project where the config attribute is not None" and that will be the config used for the current Build (and we can show it to the user as @stsewd suggested).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, Build has a date field, we can use that to query the previous build with a valid config

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a mutable object as default it's a problem. You need to change it to dict function.

Reference: https://docs.djangoproject.com/es/2.1/ref/models/fields/#django.db.models.Field.default

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I read this rpkilby/jsonfield#125, but we can change it just to be 100% sure

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Change it, please. That's not the default behavior..


length = models.IntegerField(_('Build Length'), null=True, blank=True)

Expand Down
15 changes: 15 additions & 0 deletions readthedocs/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,21 @@ def python_full_version(self):
)
return ver

def as_dict(self):
attributes = [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this logic should live somewhere else, as a PUBLIC_ATTRIBUTES or something that is more likely to be updated when changed/added?

'version', 'formats', 'python',
'conda', 'build', 'doctype',
'sphinx', 'mkdocs', 'submodules',
]
config = {}
for name in attributes:
attr = getattr(self, name)
if hasattr(attr, '_asdict'):
config[name] = attr._asdict()
else:
config[name] = attr
return config

def __getattr__(self, name):
"""Raise an error for unknown attributes."""
raise ConfigOptionNotSupportedError(name)
Expand Down
98 changes: 98 additions & 0 deletions readthedocs/config/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,60 @@ def test_config_filenames_regex(correct_config_filename):
assert re.match(CONFIG_FILENAME_REGEX, correct_config_filename)


def test_as_dict(tmpdir):
apply_fs(tmpdir, {'requirements.txt': ''})
build = get_build_config(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name of the variable is tricky. This is not a build object, but a config object.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a BuildConfig object, for historical reasons it was left like that p: I mean, we are using that variable name in all the tests of the config module, not sure if we want to rename all of them.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK.

Feel free to do whatever you prefer. In case you think this should be renamed, open an issue in the Cleanup milestone for the "future".

{
'version': 1,
'formats': ['pdf'],
'python': {
'version': 3.5,
},
'requirements_file': 'requirements.txt',
},
get_env_config({
'defaults': {
'doctype': 'sphinx',
'sphinx_configuration': None,
},
}),
source_file=str(tmpdir.join('readthedocs.yml')),
)
build.validate()
expected_dict = {
'version': '1',
'formats': ['pdf'],
'python': {
'version': 3.5,
'requirements': 'requirements.txt',
'install_with_pip': False,
'install_with_setup': False,
'extra_requirements': [],
'use_system_site_packages': False,
},
'build': {
'image': 'readthedocs/build:2.0',
},
'conda': None,
'sphinx': {
'builder': 'sphinx',
'configuration': None,
'fail_on_warning': False,
},
'mkdocs': {
'configuration': None,
'fail_on_warning': False,
},
'doctype': 'sphinx',
'submodules': {
'include': ALL,
'exclude': [],
'recursive': True,
},
}
assert build.as_dict() == expected_dict


class TestBuildConfigV2(object):

def get_build_config(self, config, env_config=None,
Expand Down Expand Up @@ -1804,3 +1858,47 @@ def test_pop_config_raise_exception(self):
build.pop_config('one.four', raise_ex=True)
assert excinfo.value.value == 'four'
assert excinfo.value.code == VALUE_NOT_FOUND

def test_as_dict(self, tmpdir):
apply_fs(tmpdir, {'requirements.txt': ''})
build = self.get_build_config(
{
'version': 2,
'formats': ['pdf'],
'python': {
'version': 3.6,
'requirements': 'requirements.txt',
},
},
source_file=str(tmpdir.join('readthedocs.yml')),
)
build.validate()
expected_dict = {
'version': '2',
'formats': ['pdf'],
'python': {
'version': 3.6,
'requirements': str(tmpdir.join('requirements.txt')),
'install_with_pip': False,
'install_with_setup': False,
'extra_requirements': [],
'use_system_site_packages': False,
},
'build': {
'image': 'readthedocs/build:latest',
},
'conda': None,
'sphinx': {
'builder': 'sphinx',
'configuration': None,
'fail_on_warning': False,
},
'mkdocs': None,
'doctype': 'sphinx',
'submodules': {
'include': [],
'exclude': ALL,
'recursive': False,
},
}
assert build.as_dict() == expected_dict
1 change: 1 addition & 0 deletions readthedocs/restapi/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ class BuildSerializer(serializers.ModelSerializer):
version_slug = serializers.ReadOnlyField(source='version.slug')
docs_url = serializers.ReadOnlyField(source='version.get_absolute_url')
state_display = serializers.ReadOnlyField(source='get_state_display')
config = serializers.JSONField(required=False)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need this because of rpkilby/jsonfield#188 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably should be a code comment


class Meta(object):
model = Build
Expand Down
1 change: 1 addition & 0 deletions readthedocs/rtd_tests/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_make_build(self):
self.assertEqual(resp.status_code, status.HTTP_201_CREATED)
build = resp.data
self.assertEqual(build['state_display'], 'Cloning')
self.assertEqual(build['config'], {})

resp = client.get('/api/v2/build/%s/' % build['id'])
self.assertEqual(resp.status_code, 200)
Expand Down