performance of run_state_machine_as_test drops drastically as a function of the number of state machine rules
Here's a synthetic example demonstrating the performance issue:
#!/usr/bin/env python
import time
import operator

import hypothesis.strategies as st
from hypothesis import settings, Verbosity
from hypothesis.stateful import (
    Bundle,
    RuleBasedStateMachine,
    rule,
    run_state_machine_as_test,
)
from tqdm.auto import tqdm


def run_test(num_dynamic_rules: int, num_examples: int):
    """
    Defines, configures, and runs a state machine test for a given
    number of rules and examples.

    Returns:
        float: The measured iterations per second.
    """
    # A new TQDM progress bar for this specific run
    pbar = tqdm(total=num_examples, desc=f"Testing {num_dynamic_rules} rules")

    count = 0

    def inc():
        nonlocal count
        count += 1

    # To ensure test isolation, the StateMachine class is defined *inside*
    # this function. This creates a new, clean class for each run.
    class DynamicRuleMachine(RuleBasedStateMachine):
        """
        A state machine that is dynamically configured for each test run.
        """

        values = Bundle("values")

        def __init__(self):
            super().__init__()
            # self.internal_state = []
            # Associate this machine instance with the correct progress bar
            self.pbar = pbar
            # Reset progress bar for each new state machine instance
            # if self.pbar.n > 0:
            #     self.pbar.reset()

        def _update_pbar(self, op_name: str):
            """Helper to update the TQDM progress bar."""
            self.pbar.set_postfix_str(f"op: {op_name}")
            self.pbar.update(1)

        @rule(target=values, n=st.integers(min_value=-100, max_value=100))
        def create_initial_integer(self, n):
            self._update_pbar("create_initial_integer")
            inc()
            # self.internal_state.append(n)
            return n

        @classmethod
        def add_dynamic_rules(cls, num_rules_to_add: int):
            """Dynamically creates and attaches new rules to this class."""
            operations = [operator.add, operator.sub, operator.mul]
            for i in range(num_rules_to_add):
                op = operations[i % len(operations)]
                rule_method = cls._create_rule_method(op, i)
                rule_name = f"dynamic_rule_{i}_{op.__name__}"
                setattr(cls, rule_name, rule_method)

        @classmethod
        def _create_rule_method(cls, op, rule_index):
            """Factory to create a rule method with the correct scope."""

            def dynamic_rule_body(self, v1, v2):
                op_name = f"rule_{rule_index}_{op.__name__}"
                self._update_pbar(op_name)
                inc()
                result = op(v1, v2)
                # self.internal_state.append(result)
                return result

            return rule(target=cls.values, v1=cls.values, v2=cls.values)(
                dynamic_rule_body
            )

    # --- Test Execution for this configuration ---
    # 1. Add the specified number of rules to the fresh class
    DynamicRuleMachine.add_dynamic_rules(num_dynamic_rules)

    # 2. Configure Hypothesis settings
    hypo_settings = settings(
        max_examples=num_examples,
        deadline=None,
        verbosity=Verbosity.quiet,  # Keep the output clean during the run
    )

    # 3. Run the state machine test and measure time
    start_time = time.perf_counter()
    run_state_machine_as_test(DynamicRuleMachine, settings=hypo_settings)
    end_time = time.perf_counter()

    pbar.close()
    total_time = end_time - start_time
    return count / total_time if total_time > 0 else float("inf")


def main():
    """
    Orchestrates multiple test runs to show how performance scales with the
    number of rules and prints a summary report.
    """
    # Define the configurations to test
    rule_counts = [10, 50, 100, 150, 200, 1000, 10_000, 50_000]
    N_EXAMPLES_PER_RUN = 100  # Use a consistent number for a fair comparison

    results = []
    print("🚀 Starting performance scaling tests for Hypothesis...")
    print(f"Each test will run for {N_EXAMPLES_PER_RUN:,} examples.")

    for n_rules in rule_counts:
        print("-" * 50)
        iter_per_sec = run_test(
            num_dynamic_rules=n_rules, num_examples=N_EXAMPLES_PER_RUN
        )
        results.append({"rules": n_rules, "iter_p_s": iter_per_sec})

    # --- Final Report ---
    print("\n" + "=" * 50)
    print("✅ Performance Scaling Summary")
    print("=" * 50)
    print(f"{'# of Rules':<15} | {'Iterations / sec':<20}")
    print(f"{'-' * 15}-+-{'-' * 20}")
    for res in results:
        print(f"{res['rules']:<15} | {res['iter_p_s']:>20,.2f}")
    print("-" * 50)


if __name__ == "__main__":
    main()
When I run this, I see:
🚀 Starting performance scaling tests for Hypothesis...
Each test will run for 100 examples.
--------------------------------------------------
Testing 10 rules: 5000it [00:01, 3536.33it/s, op: create_initial_integer]
--------------------------------------------------
Testing 50 rules: 4970it [00:02, 2015.75it/s, op: create_initial_integer]
--------------------------------------------------
Testing 100 rules: 5000it [00:04, 1050.11it/s, op: rule_29_mul]
--------------------------------------------------
Testing 150 rules: 4506it [00:05, 774.94it/s, op: rule_108_add]
--------------------------------------------------
Testing 200 rules: 4526it [00:05, 803.98it/s, op: rule_156_add]
--------------------------------------------------
Testing 1000 rules: 3386it [00:06, 536.76it/s, op: create_initial_integer]
--------------------------------------------------
Testing 10000 rules: 5005it [00:20, 249.97it/s, op: rule_8048_mul]
--------------------------------------------------
Testing 50000 rules: 4847it [01:22, 59.08it/s, op: rule_28153_sub]
==================================================
✅ Performance Scaling Summary
==================================================
# of Rules      | Iterations / sec
----------------+---------------------
10              |             3,539.81
50              |             2,017.09
100             |             1,050.94
150             |               778.81
200             |               804.99
1000            |               539.57
10000           |               254.66
50000           |                60.39
--------------------------------------------------
(The actual use case here is avoiding the use of draw when I want dynamic / dependently-typed strategies, which I want for better printing of Hypothesis failures. I can implement this a different way, but I figured I'd report it.)
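For context, here is a minimal sketch of the draw-based pattern I'm trying to avoid, using st.data() / data.draw() inside a single generic rule; the class and method names here are illustrative only:

import operator

import hypothesis.strategies as st
from hypothesis.stateful import Bundle, RuleBasedStateMachine, rule


class DrawBasedMachine(RuleBasedStateMachine):
    values = Bundle("values")

    @rule(target=values, n=st.integers(min_value=-100, max_value=100))
    def create_initial_integer(self, n):
        return n

    @rule(target=values, v1=values, v2=values, data=st.data())
    def apply_operation(self, v1, v2, data):
        # The operation is chosen inside the rule body via data.draw(), so a
        # failing example is reported in terms of this one generic rule plus
        # the drawn value, rather than a distinct, named rule per operation.
        op = data.draw(st.sampled_from([operator.add, operator.sub, operator.mul]))
        return op(v1, v2)

Generating separate rules instead, as the benchmark above does, means a failure names a specific rule, which is roughly what I mean by better printing.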
I'm not particularly worried about the performance of stateful tests with >100 rules (it's not a use case we design around, though if you've found it useful I'd be interested to hear details), but I still appreciate this report, because this scaling is dramatically worse than I would hope for.
I think the culprit here will be using sampled_from to choose the next rule, plus using a different .filter every single iteration to select the swarm-enabled rules, which prevents caching. It may be that this is not feasible to optimize, but we should still take a look.
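Roughly the shape I have in mind, as an illustrative sketch rather than the actual stateful implementation (the function and parameter names here are hypothetical):

import hypothesis.strategies as st


def rule_selection_strategy(rules, currently_enabled):
    # If the set of swarm-enabled rules changes between steps, this builds a
    # brand-new lambda (and therefore a brand-new filtered strategy) on every
    # step, so nothing keyed on the strategy object can be cached, and the
    # filtering itself scales with the number of rules.
    return st.sampled_from(rules).filter(lambda r: r in currently_enabled)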
The benchmark baseline rules are doing ~0 work, so the slowdown here would be less visible in any real project.
I encountered this when attempting to test random functions in sympy, and the slowdown was quite visible (somewhere between 2x and 10x, if I recall correctly -- which I guess you could indeed call "less visible").
Yeah, I didn't mean to suggest this has zero/low impact; my statement is trivially true, so maybe I shouldn't have even mentioned it 😄