schema icon indicating copy to clipboard operation
schema copied to clipboard

Improvement for Schema([type]) validator performance

Open alexmac opened this issue 5 years ago • 1 comments

A schema that validates that a list contains instances of a single type, e.g. Schema([str]), takes a surprising amount of time when given a large list. Specializing this would be worthwhile - I've got an example here of a ListOf schema validator that would do this as a proof of concept.

On my machine the timing for validating a list of a million strings goes from 5s to 0.3s with the ListOf(str) instead of the [str].

I could also imagine just special-casing [type] in the main schema class itself - I'm happy to write a patch if either of these directions would be accepted upstream.

import cProfile
import pstats
import io
from schema import And, Optional, Or, Schema, Use, SchemaError

class ListOf:
    """Validator that checks every item of an iterable against one type.

    Proposed as a specialised alternative to ``Schema([type])``: each item is
    checked with a single ``isinstance`` call.
    """

    def __init__(self, type, error=None):
        """Store the expected item type and an optional error template.

        Args:
            type: Class (or tuple of classes) handed to ``isinstance``. The
                name shadows the builtin inside this method only; it is kept
                so that keyword callers (``ListOf(type=str)``) keep working.
            error: Optional ``str.format`` template for the user-facing
                message. It is formatted with the offending item on a type
                mismatch, or with the whole input on an unexpected failure.
        """
        self._type = type
        self._error = error

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._type)

    def validate(self, data):
        """Return ``data`` unchanged if every item is an instance of the type.

        Raises:
            SchemaError: If an item is not an instance of the expected type,
                or if iterating/checking ``data`` raises an ordinary
                exception (e.g. ``data`` is not iterable).
        """
        try:
            for d in data:
                if not isinstance(d, self._type):
                    raise SchemaError(['%s is of type %s not %s' % (d, type(d), self._type)], [self._error.format(d) if self._error else None])
            return data
        except SchemaError:
            # Our own validation failure: pass it through untouched.
            raise
        except Exception as x:
            # Only wrap ordinary errors. KeyboardInterrupt / SystemExit are
            # deliberately left alone so a long validation run can still be
            # interrupted instead of surfacing as a schema failure.
            raise SchemaError(
                '%s raised %r' % (str(self), x),
                self._error.format(data) if self._error else None) from x

# One million valid strings; the only invalid element (an int) sits at the
# very end of the list.
test = ['foo'] * 1000000 + [42]

# Profile both validation strategies in a single run.
profiler = cProfile.Profile()
profiler.enable()

# Slow: generic list schema.
try:
    Schema([str]).validate(test)
except SchemaError as e:
    print(e)

# Fast: specialised ListOf validator.
try:
    Schema(ListOf(str)).validate(test)
except SchemaError as e:
    print(e)

profiler.disable()

# Render the collected statistics into a string buffer, ordered by the time
# spent inside each function itself, then print the report.
s = io.StringIO()
ps = pstats.Stats(profiler, stream=s)
ps.sort_stats('tottime')
ps.print_stats()
print(s.getvalue())

alexmac avatar May 23 '20 22:05 alexmac

import cProfile
import pstats
import io
from schema import And, Optional, Or, Schema, Use, SchemaError

class ListOf:
    """Validator that checks every item of an iterable against one type.

    Each item is checked with a single ``isinstance`` call.
    """

    def __init__(self, type, error=None):
        """Store the expected item type and an optional error template.

        Args:
            type: Class (or tuple of classes) handed to ``isinstance``.
            error: Optional ``str.format`` template for the user-facing
                message. It is formatted with the offending item on a type
                mismatch, or with the whole input on an unexpected failure.
        """
        self._type = type
        self._error = error

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self._type)

    def validate(self, data):
        """Return ``data`` unchanged if every item is an instance of the type.

        Raises:
            SchemaError: If an item is not an instance of the expected type,
                or if iterating/checking ``data`` raises an ordinary
                exception (e.g. ``data`` is not iterable).
        """
        try:
            for d in data:
                if not isinstance(d, self._type):
                    raise SchemaError(['%s is of type %s not %s' % (d, type(d), self._type)], [self._error.format(d) if self._error else None])
            return data
        except SchemaError:
            # Our own validation failure: pass it through untouched.
            raise
        except Exception as x:
            # Only wrap ordinary errors; KeyboardInterrupt / SystemExit must
            # still be able to stop a long validation run.
            raise SchemaError(
                '%s raised %r' % (str(self), x),
                self._error.format(data) if self._error else None) from x

# One million valid strings; the only invalid element (an int) sits at the
# very end of the list.
test = ['foo' for _ in range(1000000)] + [42]

profiler = cProfile.Profile()
profiler.enable()

# Slow
try:
    Schema([str]).validate(test)
except SchemaError as e:
    print(e)

# Fast
try:
    Schema(ListOf(str)).validate(test)
except SchemaError as e:
    print(e)

profiler.disable()

# Print the profile ordered by time spent inside each function itself.
s = io.StringIO()
ps = pstats.Stats(profiler, stream=s).sort_stats('tottime')
ps.print_stats()
print(s.getvalue())

ljluestc avatar Aug 27 '23 21:08 ljluestc