Improvement for Schema([type]) validator performance
A schema that validates that a list contains instances of a single type, e.g. Schema([str]), takes a surprising amount of time when given a large list. Specializing this case would be worthwhile; as a proof of concept, I've written an example ListOf schema validator that does this.
On my machine, validating a list of a million strings goes from 5s with [str] to 0.3s with ListOf(str).
I could also imagine just special-casing [type] in the main Schema class itself. I'm happy to write a patch if either of these directions would be accepted upstream.
import cProfile
import pstats
import io
from schema import And, Optional, Or, Schema, Use, SchemaError
class ListOf:
    """Validator that checks every item of an iterable against a single type.

    Exposes a ``validate(data)`` method so it can be wrapped in ``Schema(...)``
    as a specialised replacement for ``Schema([type])``: each item is checked
    with one ``isinstance`` call in a plain loop.
    """

    def __init__(self, type, error=None):
        """Store the expected item type and an optional error template.

        Args:
            type: Class (or tuple of classes) handed to ``isinstance``.
            error: Optional ``str.format`` template for the user-facing error
                message; it is formatted with the offending value.
        """
        # NOTE: the parameter name shadows the builtin, but it is kept so
        # callers passing ``type=...`` by keyword continue to work.
        self._type = type
        self._error = error

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._type)

    def validate(self, data):
        """Return *data* unchanged when every item is an instance of the type.

        Raises:
            SchemaError: If an item has the wrong type, or if iterating
                *data* or building the error message raises an exception.
        """
        try:
            for d in data:
                if not isinstance(d, self._type):
                    raise SchemaError(
                        ['%s is of type %s not %s' % (d, type(d), self._type)],
                        [self._error.format(d) if self._error else None])
            return data
        except SchemaError:
            # Already the right exception type; let it through untouched.
            raise
        except Exception as x:
            # Only ordinary errors (e.g. *data* is not iterable) are wrapped;
            # KeyboardInterrupt/SystemExit must not be reported as schema
            # failures, so BaseException is deliberately not caught here.
            raise SchemaError(
                '%s raised %r' % (str(self), x),
                self._error.format(data) if self._error else None)
# Benchmark input: one million valid strings followed by a single invalid
# item, so both validators must scan the whole list before failing.
test = ['foo' for _ in range(1000000)] + [42]

profiler = cProfile.Profile()
profiler.enable()

# Slow
try:
    Schema([str]).validate(test)
except SchemaError as e:
    print(e)

# Fast
try:
    Schema(ListOf(str)).validate(test)
except SchemaError as e:
    print(e)

profiler.disable()

# Collect the profile into a string buffer, sorted by time spent inside each
# function itself, then print the report.
s = io.StringIO()
ps = pstats.Stats(profiler, stream=s).sort_stats('tottime')
ps.print_stats()
print(s.getvalue())
import cProfile import pstats import io from schema import And, Optional, Or, Schema, Use, SchemaError
class ListOf:
    """Validator that checks every item of an iterable against a single type.

    Exposes a ``validate(data)`` method so it can be wrapped in ``Schema(...)``
    as a specialised replacement for ``Schema([type])``: each item is checked
    with one ``isinstance`` call in a plain loop.
    """

    # The constructor must be spelled ``__init__``; a method named ``init`` is
    # never invoked by ``ListOf(...)`` and leaves the instance unconfigured.
    def __init__(self, type, error=None):
        """Store the expected item type and an optional error template.

        Args:
            type: Class (or tuple of classes) handed to ``isinstance``.
            error: Optional ``str.format`` template for the user-facing error
                message; it is formatted with the offending value.
        """
        self._type = type
        self._error = error

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._type)

    def validate(self, data):
        """Return *data* unchanged when every item is an instance of the type.

        Raises:
            SchemaError: If an item has the wrong type, or if iterating
                *data* or building the error message raises an exception.
        """
        try:
            for d in data:
                if not isinstance(d, self._type):
                    raise SchemaError(
                        ['%s is of type %s not %s' % (d, type(d), self._type)],
                        [self._error.format(d) if self._error else None])
            return data
        except SchemaError:
            # Already the right exception type; let it through untouched.
            raise
        except Exception as x:
            # Only ordinary errors (e.g. *data* is not iterable) are wrapped;
            # KeyboardInterrupt/SystemExit must not be reported as schema
            # failures, so BaseException is deliberately not caught here.
            raise SchemaError(
                '%s raised %r' % (str(self), x),
                self._error.format(data) if self._error else None)
# Benchmark input: one million valid strings followed by a single invalid
# item, so both validators must scan the whole list before failing.
test = ['foo' for _ in range(1000000)] + [42]

profiler = cProfile.Profile()
profiler.enable()

# Slow
try:
    Schema([str]).validate(test)
except SchemaError as e:
    print(e)

# Fast
try:
    Schema(ListOf(str)).validate(test)
except SchemaError as e:
    print(e)

profiler.disable()

# Collect the profile into a string buffer, sorted by time spent inside each
# function itself, then print the report.
s = io.StringIO()
ps = pstats.Stats(profiler, stream=s).sort_stats('tottime')
ps.print_stats()
print(s.getvalue())