Skip to content

Perfornance - Cache expensive url operations #203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/env/bin python
"""
Benchmark the performance of jsonschema.

Example benchmark:

wget http://swagger.io/v2/schema.json
wget http://petstore.swagger.io/v2/swagger.json
python bench.py -r 5 schema.json swagger.json

"""
from __future__ import print_function
import argparse
import cProfile
import json
import time

import jsonschema


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('schema', help="path to a schema used to benchmark")
parser.add_argument('document', help="document to validate with schema")
parser.add_argument('-r', '--repeat', type=int, help="number of iterations")
parser.add_argument('--profile',
help="Enable profiling, write profile to this filepath")
return parser.parse_args()


def run(filename, schema, document):
resolver = jsonschema.RefResolver(
'file://{0}'.format(filename),
schema,
store={schema['id']: schema})
jsonschema.validate(document, schema, resolver=resolver)


def format_time(time_):
return "%.3fms" % (time_ * 1000)


def run_timeit(schema_filename, document_filename, repeat, profile):
with open(schema_filename) as schema_file:
schema = json.load(schema_file)

with open(document_filename) as fh:
document = json.load(fh)

if profile:
profiler = cProfile.Profile()
profiler.enable()

times = []
for _ in range(repeat):
start_time = time.time()
run(schema_filename, schema, document)
times.append(time.time() - start_time)

if profile:
profiler.disable()
profiler.dump_stats(profile)

print(", ".join(map(format_time, sorted(times))))
print("Mean: {0}".format(format_time(sum(times) / repeat)))


def main():
args = parse_args()
run_timeit(args.schema, args.document, args.repeat, args.profile)


if __name__ == "__main__":
main()
4 changes: 1 addition & 3 deletions jsonschema/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
Draft3Validator, Draft4Validator, RefResolver, validate
)


__version__ = "2.5.0-dev"

from jsonschema.version import __version__

# flake8: noqa
7 changes: 6 additions & 1 deletion jsonschema/_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,14 @@ def enum(validator, enums, instance, schema):


def ref(validator, ref, instance, schema):
with validator.resolver.resolving(ref) as resolved:
scope, resolved = validator.resolver.resolve(ref)
validator.resolver.push_scope(scope)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's another backwards incompatibility here, which is that these are newly introduced methods on the RefResolver interface that people who might have implemented them did not need until now.

To resolve that, I think we'd either need to fall back to resolving if there was no push_scope attribute, or perhaps you might consider an interface that composes RefResolver rather than modifying RefResolver itself. Lemme know if this makes sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance improvement here was to remove the contextmanager. I'm not sure how to compose over that change. The interfaces seem incompatible to me (but I could be missing something).

Falling back to the old interface would be easy enough:

if hasattr(validator.resolver, 'resolve'):
    ... # do what this does now
    return

... # Use the old contextmanager way

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious -- did we benchmark whether the reason it's slow is a context manager vs specifically functools.contextmanager?

But yeah, sorry, you're right -- although you don't need to do that check here per se every time you hit this function, you can check to see if you have a new or old RefResolver in __init__ for the validator and bind accordingly?

EDIT: erm no, never mind, this is outside the validator, so yeah you'd have to do it every time. OK, fine, should be alright. So never mind this comment :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did experiment with using a class with __enter__ and __exit__ and it was about the same as contextlib.contextmanager (possibly even a little slower). It's always possible I was doing something non-optimal.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. OK, I'd assume there's some cost in generator-nonsense with contextmanager, but I'm not sure I'd have guessed it'd be insignificant to switch to the regular protocol. Regardless, this way is fine with me, but if I have an overabundance of time (ha) I'll double check.


try:
for error in validator.descend(instance, resolved):
yield error
finally:
validator.resolver.pop_scope()


def type_draft3(validator, types, instance, schema):
Expand Down
6 changes: 5 additions & 1 deletion jsonschema/compat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import unicode_literals
import sys

import operator
import sys


try:
from collections import MutableMapping, Sequence # noqa
Expand All @@ -11,6 +13,7 @@

if PY3:
zip = zip
from functools import lru_cache
from io import StringIO
from urllib.parse import (
unquote, urljoin, urlunsplit, SplitResult, urlsplit as _urlsplit
Expand All @@ -21,6 +24,7 @@
iteritems = operator.methodcaller("items")
else:
from itertools import izip as zip # noqa
from repoze.lru import lru_cache
from StringIO import StringIO
from urlparse import (
urljoin, urlunsplit, SplitResult, urlsplit as _urlsplit # noqa
Expand Down
26 changes: 14 additions & 12 deletions jsonschema/tests/test_validators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from collections import deque
from contextlib import contextmanager
import json

from jsonschema import FormatChecker, ValidationError
Expand Down Expand Up @@ -633,12 +632,8 @@ def test_it_delegates_to_a_ref_resolver(self):
resolver = RefResolver("", {})
schema = {"$ref" : mock.Mock()}

@contextmanager
def resolving():
yield {"type": "integer"}

with mock.patch.object(resolver, "resolving") as resolve:
resolve.return_value = resolving()
with mock.patch.object(resolver, "resolve") as resolve:
resolve.return_value = "url", {"type": "integer"}
with self.assertRaises(ValidationError):
self.validator_class(schema, resolver=resolver).validate(None)

Expand Down Expand Up @@ -775,11 +770,11 @@ def test_it_resolves_local_refs(self):
self.assertEqual(resolved, self.referrer["properties"]["foo"])

def test_it_resolves_local_refs_with_id(self):
schema = {"id": "foo://bar/schema#", "a": {"foo": "bar"}}
schema = {"id": "http://bar/schema#", "a": {"foo": "bar"}}
resolver = RefResolver.from_schema(schema)
with resolver.resolving("#/a") as resolved:
self.assertEqual(resolved, schema["a"])
with resolver.resolving("foo://bar/schema#/a") as resolved:
with resolver.resolving("http://bar/schema#/a") as resolved:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to change these to http because (at least on py3) the behaviour of urljoin changes based on the scheme. With an unknown scheme it was not replacing the fragment at all.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah there's some global that defines weirdly which schemes it's fine with. I don't recall why this worked before then?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe it worked before because it wasn't joining the ref to the rest of the url. The url was being fetched first, then the ref was being used to locate the section in the document. Now the full url is being stored as the scope instead.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, OK, makes sense.

self.assertEqual(resolved, schema["a"])

def test_it_retrieves_stored_refs(self):
Expand Down Expand Up @@ -816,6 +811,7 @@ def test_it_can_construct_a_base_uri_from_a_schema(self):
schema = {"id" : "foo"}
resolver = RefResolver.from_schema(schema)
self.assertEqual(resolver.base_uri, "foo")
self.assertEqual(resolver.resolution_scope, "foo")
with resolver.resolving("") as resolved:
self.assertEqual(resolved, schema)
with resolver.resolving("#") as resolved:
Expand All @@ -829,6 +825,7 @@ def test_it_can_construct_a_base_uri_from_a_schema_without_id(self):
schema = {}
resolver = RefResolver.from_schema(schema)
self.assertEqual(resolver.base_uri, "")
self.assertEqual(resolver.resolution_scope, "")
with resolver.resolving("") as resolved:
self.assertEqual(resolved, schema)
with resolver.resolving("#") as resolved:
Expand Down Expand Up @@ -863,9 +860,7 @@ def test_cache_remote_off(self):
)
with resolver.resolving(ref):
pass
with resolver.resolving(ref):
pass
self.assertEqual(foo_handler.call_count, 2)
self.assertEqual(foo_handler.call_count, 1)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With caching this is no longer called twice, so I had to set the count to 1.


def test_if_you_give_it_junk_you_get_a_resolution_error(self):
ref = "foo://bar"
Expand All @@ -876,6 +871,13 @@ def test_if_you_give_it_junk_you_get_a_resolution_error(self):
pass
self.assertEqual(str(err.exception), "Oh no! What's this?")

def test_helpful_error_message_on_failed_pop_scope(self):
resolver = RefResolver("", {})
resolver.pop_scope()
with self.assertRaises(RefResolutionError) as exc:
resolver.pop_scope()
self.assertIn("Failed to pop the scope", str(exc.exception))


def sorted_errors(errors):
def key(error):
Expand Down
86 changes: 63 additions & 23 deletions jsonschema/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from jsonschema import _utils, _validators
from jsonschema.compat import (
Sequence, urljoin, urlsplit, urldefrag, unquote, urlopen,
str_types, int_types, iteritems,
str_types, int_types, iteritems, lru_cache,
)
from jsonschema.exceptions import ErrorTree # Backwards compatibility # noqa
from jsonschema.exceptions import RefResolutionError, SchemaError, UnknownType
Expand Down Expand Up @@ -79,7 +79,10 @@ def iter_errors(self, instance, _schema=None):
if _schema is None:
_schema = self.schema

with self.resolver.in_scope(_schema.get(u"id", u"")):
scope = _schema.get(u"id")
if scope:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably want is not None? Which might mean a missing test case? (an id of "" is probably meaningless, but I guess it means something reasonable relative to some other base URI.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think "" is relevant, here's my simple test cases:

In [7]: urljoin("", "#/path")
Out[7]: '#/path'

In [8]: urljoin("", "http://example.com/#/path")
Out[8]: 'http://example.com/#/path'

In [9]: urljoin("#/path", "")
Out[9]: '#/path'

In [10]: urljoin("http://example.com/#/path", "")
Out[10]: 'http://example.com/#/path'

In [11]: urljoin("http://example.com/", "")
Out[11]: 'http://example.com/'

self.resolver.push_scope(scope)
try:
ref = _schema.get(u"$ref")
if ref is not None:
validators = [(u"$ref", ref)]
Expand All @@ -103,6 +106,9 @@ def iter_errors(self, instance, _schema=None):
if k != u"$ref":
error.schema_path.appendleft(k)
yield error
finally:
if scope:
self.resolver.pop_scope()

def descend(self, instance, schema, path=None, schema_path=None):
for error in self.iter_errors(instance, schema):
Expand Down Expand Up @@ -227,26 +233,32 @@ class RefResolver(object):
first resolution
:argument dict handlers: a mapping from URI schemes to functions that
should be used to retrieve them

:arguments callable cache_func: a function decorator used to cache
expensive calls. Should support the `functools.lru_cache` interface.
:argument int cache_maxsize: number of items to store in the cache. Set
this to 0 to disable caching. Defaults to 1000.
"""

def __init__(
self, base_uri, referrer, store=(), cache_remote=True, handlers=(),
cache_func=lru_cache, cache_maxsize=1000,
):
self.base_uri = base_uri
self.resolution_scope = base_uri
# This attribute is not used, it is for backwards compatibility
self.referrer = referrer
self.cache_remote = cache_remote
self.handlers = dict(handlers)

self._scopes_stack = [base_uri]
self.store = _utils.URIDict(
(id, validator.META_SCHEMA)
for id, validator in iteritems(meta_schemas)
)
self.store.update(store)
self.store[base_uri] = referrer

self._urljoin_cache = cache_func(cache_maxsize)(urljoin)
self._resolve_cache = cache_func(cache_maxsize)(self.resolve_from_url)

@classmethod
def from_schema(cls, schema, *args, **kwargs):
"""
Expand All @@ -259,44 +271,72 @@ def from_schema(cls, schema, *args, **kwargs):

return cls(schema.get(u"id", u""), schema, *args, **kwargs)

def push_scope(self, scope):
self._scopes_stack.append(
self._urljoin_cache(self.resolution_scope, scope))

def pop_scope(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment about making these methods private, unless there's a reason you can think of to not (although I thikn this question is moot if we consider the composition route I mentioned above) -- certainly if it's not private we probably should handle errors a bit nicer, which might make sense regardless.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since these (pop/push) are being called from ref() I think they need to remain public. I'm still not sure about the composition route.

The errors here would be for pop() on an empty stack? Theoretically that should never happen because we always match push/pop, but a nicer error won't hurt either.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ech yep, right again. OK.

And yes to the error -- it's true it's unlikely, but if it's public API, someone else can use these, and do so incorrectly. Not a huge deal though.

try:
self._scopes_stack.pop()
except IndexError:
raise RefResolutionError(
"Failed to pop the scope from an empty stack. "
"`pop_scope()` should only be called once for every "
"`push_scope()`")

@property
def resolution_scope(self):
return self._scopes_stack[-1]

# backwards compatibility
@property
def base_uri(self):
uri, _ = urldefrag(self.resolution_scope)
return uri

# Deprecated, this function is no longer used, but is preserved for
# backwards compatibility
@contextlib.contextmanager
def in_scope(self, scope):
old_scope = self.resolution_scope
self.resolution_scope = urljoin(old_scope, scope)
self.push_scope(scope)
try:
yield
finally:
self.resolution_scope = old_scope
self.pop_scope()

# Deprecated, this function is no longer used, but is preserved for
# backwards compatibility
@contextlib.contextmanager
def resolving(self, ref):
url, resolved = self.resolve(ref)
self.push_scope(url)
try:
yield resolved
finally:
self.pop_scope()

def resolve(self, ref):
"""
Context manager which resolves a JSON ``ref`` and enters the
resolution scope of this ref.

:argument str ref: reference to resolve

"""
url = self._urljoin_cache(self.resolution_scope, ref)
return url, self._resolve_cache(url)

full_uri = urljoin(self.resolution_scope, ref)
uri, fragment = urldefrag(full_uri)
if not uri:
uri = self.base_uri

if uri in self.store:
document = self.store[uri]
else:
def resolve_from_url(self, url):
url, fragment = urldefrag(url)
try:
document = self.store[url]
except KeyError:
try:
document = self.resolve_remote(uri)
document = self.resolve_remote(url)
except Exception as exc:
raise RefResolutionError(exc)

old_base_uri, self.base_uri = self.base_uri, uri
try:
with self.in_scope(uri):
yield self.resolve_fragment(document, fragment)
finally:
self.base_uri = old_base_uri
return self.resolve_fragment(document, fragment)

def resolve_fragment(self, document, fragment):
"""
Expand Down
1 change: 1 addition & 0 deletions jsonschema/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "2.5.0-dev"
15 changes: 13 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import os.path
from setuptools import setup
import sys

from jsonschema import __version__

# Load __version__ info globals without importing anything
with open(
os.path.join(os.path.dirname(__file__), 'jsonschema', 'version.py')
) as fh:
exec(fh.read())

with open("README.rst") as readme:
long_description = readme.read()
Expand All @@ -21,6 +26,11 @@
"Programming Language :: Python :: Implementation :: PyPy",
]

install_requires = []

if sys.version_info < (3, 2):
install_requires.append('repoze.lru >= 0.6')

setup(
name="jsonschema",
version=__version__,
Expand All @@ -34,4 +44,5 @@
long_description=long_description,
url="http://github.com/Julian/jsonschema",
entry_points={"console_scripts": ["jsonschema = jsonschema.cli:main"]},
install_requires=install_requires,
)