-
-
Notifications
You must be signed in to change notification settings - Fork 594
Perfornance - Cache expensive url operations #203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2fda155
22701dc
812392b
613cf3e
d1e2448
ca59f3f
ee1a256
7241db0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/env/bin python | ||
""" | ||
Benchmark the performance of jsonschema. | ||
|
||
Example benchmark: | ||
|
||
wget http://swagger.io/v2/schema.json | ||
wget http://petstore.swagger.io/v2/swagger.json | ||
python bench.py -r 5 schema.json swagger.json | ||
|
||
""" | ||
from __future__ import print_function | ||
import argparse | ||
import cProfile | ||
import json | ||
import time | ||
|
||
import jsonschema | ||
|
||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument('schema', help="path to a schema used to benchmark") | ||
parser.add_argument('document', help="document to validate with schema") | ||
parser.add_argument('-r', '--repeat', type=int, help="number of iterations") | ||
parser.add_argument('--profile', | ||
help="Enable profiling, write profile to this filepath") | ||
return parser.parse_args() | ||
|
||
|
||
def run(filename, schema, document): | ||
resolver = jsonschema.RefResolver( | ||
'file://{0}'.format(filename), | ||
schema, | ||
store={schema['id']: schema}) | ||
jsonschema.validate(document, schema, resolver=resolver) | ||
|
||
|
||
def format_time(time_): | ||
return "%.3fms" % (time_ * 1000) | ||
|
||
|
||
def run_timeit(schema_filename, document_filename, repeat, profile): | ||
with open(schema_filename) as schema_file: | ||
schema = json.load(schema_file) | ||
|
||
with open(document_filename) as fh: | ||
document = json.load(fh) | ||
|
||
if profile: | ||
profiler = cProfile.Profile() | ||
profiler.enable() | ||
|
||
times = [] | ||
for _ in range(repeat): | ||
start_time = time.time() | ||
run(schema_filename, schema, document) | ||
times.append(time.time() - start_time) | ||
|
||
if profile: | ||
profiler.disable() | ||
profiler.dump_stats(profile) | ||
|
||
print(", ".join(map(format_time, sorted(times)))) | ||
print("Mean: {0}".format(format_time(sum(times) / repeat))) | ||
|
||
|
||
def main(): | ||
args = parse_args() | ||
run_timeit(args.schema, args.document, args.repeat, args.profile) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,4 @@ | ||
from collections import deque | ||
from contextlib import contextmanager | ||
import json | ||
|
||
from jsonschema import FormatChecker, ValidationError | ||
|
@@ -633,12 +632,8 @@ def test_it_delegates_to_a_ref_resolver(self): | |
resolver = RefResolver("", {}) | ||
schema = {"$ref" : mock.Mock()} | ||
|
||
@contextmanager | ||
def resolving(): | ||
yield {"type": "integer"} | ||
|
||
with mock.patch.object(resolver, "resolving") as resolve: | ||
resolve.return_value = resolving() | ||
with mock.patch.object(resolver, "resolve") as resolve: | ||
resolve.return_value = "url", {"type": "integer"} | ||
with self.assertRaises(ValidationError): | ||
self.validator_class(schema, resolver=resolver).validate(None) | ||
|
||
|
@@ -775,11 +770,11 @@ def test_it_resolves_local_refs(self): | |
self.assertEqual(resolved, self.referrer["properties"]["foo"]) | ||
|
||
def test_it_resolves_local_refs_with_id(self): | ||
schema = {"id": "foo://bar/schema#", "a": {"foo": "bar"}} | ||
schema = {"id": "http://bar/schema#", "a": {"foo": "bar"}} | ||
resolver = RefResolver.from_schema(schema) | ||
with resolver.resolving("#/a") as resolved: | ||
self.assertEqual(resolved, schema["a"]) | ||
with resolver.resolving("foo://bar/schema#/a") as resolved: | ||
with resolver.resolving("http://bar/schema#/a") as resolved: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had to change these to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah there's some global that defines weirdly which schemes it's fine with. I don't recall why this worked before then? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe it worked before because it wasn't joining the ref to the rest of the url. The url was being fetched first, then the ref was being used to locate the section in the document. Now the full url is being stored as the scope instead. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, OK, makes sense. |
||
self.assertEqual(resolved, schema["a"]) | ||
|
||
def test_it_retrieves_stored_refs(self): | ||
|
@@ -816,6 +811,7 @@ def test_it_can_construct_a_base_uri_from_a_schema(self): | |
schema = {"id" : "foo"} | ||
resolver = RefResolver.from_schema(schema) | ||
self.assertEqual(resolver.base_uri, "foo") | ||
self.assertEqual(resolver.resolution_scope, "foo") | ||
with resolver.resolving("") as resolved: | ||
self.assertEqual(resolved, schema) | ||
with resolver.resolving("#") as resolved: | ||
|
@@ -829,6 +825,7 @@ def test_it_can_construct_a_base_uri_from_a_schema_without_id(self): | |
schema = {} | ||
resolver = RefResolver.from_schema(schema) | ||
self.assertEqual(resolver.base_uri, "") | ||
self.assertEqual(resolver.resolution_scope, "") | ||
with resolver.resolving("") as resolved: | ||
self.assertEqual(resolved, schema) | ||
with resolver.resolving("#") as resolved: | ||
|
@@ -863,9 +860,7 @@ def test_cache_remote_off(self): | |
) | ||
with resolver.resolving(ref): | ||
pass | ||
with resolver.resolving(ref): | ||
pass | ||
self.assertEqual(foo_handler.call_count, 2) | ||
self.assertEqual(foo_handler.call_count, 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With caching this is no longer called twice, so I had to set the count to 1. |
||
|
||
def test_if_you_give_it_junk_you_get_a_resolution_error(self): | ||
ref = "foo://bar" | ||
|
@@ -876,6 +871,13 @@ def test_if_you_give_it_junk_you_get_a_resolution_error(self): | |
pass | ||
self.assertEqual(str(err.exception), "Oh no! What's this?") | ||
|
||
def test_helpful_error_message_on_failed_pop_scope(self): | ||
resolver = RefResolver("", {}) | ||
resolver.pop_scope() | ||
with self.assertRaises(RefResolutionError) as exc: | ||
resolver.pop_scope() | ||
self.assertIn("Failed to pop the scope", str(exc.exception)) | ||
|
||
|
||
def sorted_errors(errors): | ||
def key(error): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,7 @@ | |
from jsonschema import _utils, _validators | ||
from jsonschema.compat import ( | ||
Sequence, urljoin, urlsplit, urldefrag, unquote, urlopen, | ||
str_types, int_types, iteritems, | ||
str_types, int_types, iteritems, lru_cache, | ||
) | ||
from jsonschema.exceptions import ErrorTree # Backwards compatibility # noqa | ||
from jsonschema.exceptions import RefResolutionError, SchemaError, UnknownType | ||
|
@@ -79,7 +79,10 @@ def iter_errors(self, instance, _schema=None): | |
if _schema is None: | ||
_schema = self.schema | ||
|
||
with self.resolver.in_scope(_schema.get(u"id", u"")): | ||
scope = _schema.get(u"id") | ||
if scope: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think "" is relevant, here's my simple test cases: In [7]: urljoin("", "#/path")
Out[7]: '#/path'
In [8]: urljoin("", "http://example.com/#/path")
Out[8]: 'http://example.com/#/path'
In [9]: urljoin("#/path", "")
Out[9]: '#/path'
In [10]: urljoin("http://example.com/#/path", "")
Out[10]: 'http://example.com/#/path'
In [11]: urljoin("http://example.com/", "")
Out[11]: 'http://example.com/' |
||
self.resolver.push_scope(scope) | ||
try: | ||
ref = _schema.get(u"$ref") | ||
if ref is not None: | ||
validators = [(u"$ref", ref)] | ||
|
@@ -103,6 +106,9 @@ def iter_errors(self, instance, _schema=None): | |
if k != u"$ref": | ||
error.schema_path.appendleft(k) | ||
yield error | ||
finally: | ||
if scope: | ||
self.resolver.pop_scope() | ||
|
||
def descend(self, instance, schema, path=None, schema_path=None): | ||
for error in self.iter_errors(instance, schema): | ||
|
@@ -227,26 +233,32 @@ class RefResolver(object): | |
first resolution | ||
:argument dict handlers: a mapping from URI schemes to functions that | ||
should be used to retrieve them | ||
|
||
:arguments callable cache_func: a function decorator used to cache | ||
expensive calls. Should support the `functools.lru_cache` interface. | ||
:argument int cache_maxsize: number of items to store in the cache. Set | ||
this to 0 to disable caching. Defaults to 1000. | ||
""" | ||
|
||
def __init__( | ||
self, base_uri, referrer, store=(), cache_remote=True, handlers=(), | ||
cache_func=lru_cache, cache_maxsize=1000, | ||
): | ||
self.base_uri = base_uri | ||
self.resolution_scope = base_uri | ||
# This attribute is not used, it is for backwards compatibility | ||
self.referrer = referrer | ||
self.cache_remote = cache_remote | ||
self.handlers = dict(handlers) | ||
|
||
self._scopes_stack = [base_uri] | ||
self.store = _utils.URIDict( | ||
(id, validator.META_SCHEMA) | ||
for id, validator in iteritems(meta_schemas) | ||
) | ||
self.store.update(store) | ||
self.store[base_uri] = referrer | ||
|
||
self._urljoin_cache = cache_func(cache_maxsize)(urljoin) | ||
self._resolve_cache = cache_func(cache_maxsize)(self.resolve_from_url) | ||
|
||
@classmethod | ||
def from_schema(cls, schema, *args, **kwargs): | ||
""" | ||
|
@@ -259,44 +271,72 @@ def from_schema(cls, schema, *args, **kwargs): | |
|
||
return cls(schema.get(u"id", u""), schema, *args, **kwargs) | ||
|
||
def push_scope(self, scope): | ||
self._scopes_stack.append( | ||
self._urljoin_cache(self.resolution_scope, scope)) | ||
|
||
def pop_scope(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment about making these methods private, unless there's a reason you can think of to not (although I thikn this question is moot if we consider the composition route I mentioned above) -- certainly if it's not private we probably should handle errors a bit nicer, which might make sense regardless. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since these (pop/push) are being called from The errors here would be for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ech yep, right again. OK. And yes to the error -- it's true it's unlikely, but if it's public API, someone else can use these, and do so incorrectly. Not a huge deal though. |
||
try: | ||
self._scopes_stack.pop() | ||
except IndexError: | ||
raise RefResolutionError( | ||
"Failed to pop the scope from an empty stack. " | ||
"`pop_scope()` should only be called once for every " | ||
"`push_scope()`") | ||
|
||
@property | ||
def resolution_scope(self): | ||
return self._scopes_stack[-1] | ||
|
||
# backwards compatibility | ||
@property | ||
def base_uri(self): | ||
uri, _ = urldefrag(self.resolution_scope) | ||
return uri | ||
|
||
# Deprecated, this function is no longer used, but is preserved for | ||
# backwards compatibility | ||
@contextlib.contextmanager | ||
def in_scope(self, scope): | ||
old_scope = self.resolution_scope | ||
self.resolution_scope = urljoin(old_scope, scope) | ||
self.push_scope(scope) | ||
try: | ||
yield | ||
finally: | ||
self.resolution_scope = old_scope | ||
self.pop_scope() | ||
|
||
# Deprecated, this function is no longer used, but is preserved for | ||
# backwards compatibility | ||
@contextlib.contextmanager | ||
def resolving(self, ref): | ||
url, resolved = self.resolve(ref) | ||
self.push_scope(url) | ||
try: | ||
yield resolved | ||
finally: | ||
self.pop_scope() | ||
|
||
def resolve(self, ref): | ||
""" | ||
Context manager which resolves a JSON ``ref`` and enters the | ||
resolution scope of this ref. | ||
|
||
:argument str ref: reference to resolve | ||
|
||
""" | ||
url = self._urljoin_cache(self.resolution_scope, ref) | ||
return url, self._resolve_cache(url) | ||
|
||
full_uri = urljoin(self.resolution_scope, ref) | ||
uri, fragment = urldefrag(full_uri) | ||
if not uri: | ||
uri = self.base_uri | ||
|
||
if uri in self.store: | ||
document = self.store[uri] | ||
else: | ||
def resolve_from_url(self, url): | ||
url, fragment = urldefrag(url) | ||
try: | ||
document = self.store[url] | ||
except KeyError: | ||
try: | ||
document = self.resolve_remote(uri) | ||
document = self.resolve_remote(url) | ||
except Exception as exc: | ||
raise RefResolutionError(exc) | ||
|
||
old_base_uri, self.base_uri = self.base_uri, uri | ||
try: | ||
with self.in_scope(uri): | ||
yield self.resolve_fragment(document, fragment) | ||
finally: | ||
self.base_uri = old_base_uri | ||
return self.resolve_fragment(document, fragment) | ||
|
||
def resolve_fragment(self, document, fragment): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
__version__ = "2.5.0-dev" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's another backwards incompatibility here, which is that these are newly introduced methods on the
RefResolver
interface that people who might have implemented them did not need until now.To resolve that, I think we'd either need to fall back to
resolving
if there was nopush_scope
attribute, or perhaps you might consider an interface that composesRefResolver
rather than modifyingRefResolver
itself. Lemme know if this makes sense.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The performance improvement here was to remove the contextmanager. I'm not sure how to compose over that change. The interfaces seem incompatible to me (but I could be missing something).
Falling back to the old interface would be easy enough:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just curious -- did we benchmark whether the reason it's slow is a context manager vs specifically
functools.contextmanager
?But yeah, sorry, you're right -- although you don't need to do that check here per se every time you hit this function, you can check to see if you have a new or old
RefResolver
in__init__
for the validator and bind accordingly?EDIT: erm no, never mind, this is outside the validator, so yeah you'd have to do it every time. OK, fine, should be alright. So never mind this comment :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did experiment with using a class with
__enter__
and__exit__
and it was about the same ascontextlib.contextmanager
(possibly even a little slower). It's always possible I was doing something non-optimal.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it. OK, I'd assume there's some cost in generator-nonsense with
contextmanager
, but I'm not sure I'd have guessed it'd be insignificant to switch to the regular protocol. Regardless, this way is fine with me, but if I have an overabundance of time (ha) I'll double check.