performance of run_state_machine_as_test drops drastically as a function of the number of state machine rules
Here's a synthetic example demonstrating the performance issue:
#!/usr/bin/env python
import time
import operator

import hypothesis.strategies as st
from hypothesis import settings, Verbosity
from hypothesis.stateful import (
    Bundle,
    RuleBasedStateMachine,
    rule,
    run_state_machine_as_test,
)
from tqdm.auto import tqdm


def run_test(num_dynamic_rules: int, num_examples: int):
    """
    Defines, configures, and runs a state machine test for a given
    number of rules and examples.

    Returns:
        float: The measured iterations per second.
    """
    # A new TQDM progress bar for this specific run
    pbar = tqdm(total=num_examples, desc=f"Testing {num_dynamic_rules} rules")

    count = 0

    def inc():
        nonlocal count
        count += 1

    # To ensure test isolation, the StateMachine class is defined *inside*
    # this function. This creates a new, clean class for each run.
    class DynamicRuleMachine(RuleBasedStateMachine):
        """
        A state machine that is dynamically configured for each test run.
        """

        values = Bundle("values")

        def __init__(self):
            super().__init__()
            # self.internal_state = []
            # Associate this machine instance with the correct progress bar
            self.pbar = pbar
            # Reset progress bar for each new state machine instance
            # if self.pbar.n > 0:
            #     self.pbar.reset()

        def _update_pbar(self, op_name: str):
            """Helper to update the TQDM progress bar."""
            self.pbar.set_postfix_str(f"op: {op_name}")
            self.pbar.update(1)

        @rule(target=values, n=st.integers(min_value=-100, max_value=100))
        def create_initial_integer(self, n):
            self._update_pbar("create_initial_integer")
            inc()
            # self.internal_state.append(n)
            return n

        @classmethod
        def add_dynamic_rules(cls, num_rules_to_add: int):
            """Dynamically creates and attaches new rules to this class."""
            operations = [operator.add, operator.sub, operator.mul]
            for i in range(num_rules_to_add):
                op = operations[i % len(operations)]
                rule_method = cls._create_rule_method(op, i)
                rule_name = f"dynamic_rule_{i}_{op.__name__}"
                setattr(cls, rule_name, rule_method)

        @classmethod
        def _create_rule_method(cls, op, rule_index):
            """Factory to create a rule method with the correct scope."""

            def dynamic_rule_body(self, v1, v2):
                op_name = f"rule_{rule_index}_{op.__name__}"
                self._update_pbar(op_name)
                inc()
                result = op(v1, v2)
                # self.internal_state.append(result)
                return result

            return rule(target=cls.values, v1=cls.values, v2=cls.values)(
                dynamic_rule_body
            )

    # --- Test Execution for this configuration ---
    # 1. Add the specified number of rules to the fresh class
    DynamicRuleMachine.add_dynamic_rules(num_dynamic_rules)

    # 2. Configure Hypothesis settings
    hypo_settings = settings(
        max_examples=num_examples,
        deadline=None,
        verbosity=Verbosity.quiet,  # Keep the output clean during the run
    )

    # 3. Run the state machine test and measure time
    start_time = time.perf_counter()
    run_state_machine_as_test(DynamicRuleMachine, settings=hypo_settings)
    end_time = time.perf_counter()

    pbar.close()
    total_time = end_time - start_time
    return count / total_time if total_time > 0 else float("inf")


def main():
    """
    Orchestrates multiple test runs to show how performance scales with the
    number of rules and prints a summary report.
    """
    # Define the configurations to test
    rule_counts = [10, 50, 100, 150, 200, 1000, 10_000, 50_000]
    N_EXAMPLES_PER_RUN = 100  # Use a consistent number for a fair comparison

    results = []
    print("🚀 Starting performance scaling tests for Hypothesis...")
    print(f"Each test will run for {N_EXAMPLES_PER_RUN:,} examples.")

    for n_rules in rule_counts:
        print("-" * 50)
        iter_per_sec = run_test(
            num_dynamic_rules=n_rules, num_examples=N_EXAMPLES_PER_RUN
        )
        results.append({"rules": n_rules, "iter_p_s": iter_per_sec})

    # --- Final Report ---
    print("\n" + "=" * 50)
    print("✅ Performance Scaling Summary")
    print("=" * 50)
    print(f"{'# of Rules':<15} | {'Iterations / sec':<20}")
    print(f"{'-' * 15}-+-{'-' * 20}")
    for res in results:
        print(f"{res['rules']:<15} | {res['iter_p_s']:>20,.2f}")
    print("-" * 50)


if __name__ == "__main__":
    main()
When I run this, I see:
🚀 Starting performance scaling tests for Hypothesis...
Each test will run for 100 examples.
--------------------------------------------------
Testing 10 rules: 5000it [00:01, 3536.33it/s, op: create_initial_integer]
--------------------------------------------------
Testing 50 rules: 4970it [00:02, 2015.75it/s, op: create_initial_integer]
--------------------------------------------------
Testing 100 rules: 5000it [00:04, 1050.11it/s, op: rule_29_mul]
--------------------------------------------------
Testing 150 rules: 4506it [00:05, 774.94it/s, op: rule_108_add]
--------------------------------------------------
Testing 200 rules: 4526it [00:05, 803.98it/s, op: rule_156_add]
--------------------------------------------------
Testing 1000 rules: 3386it [00:06, 536.76it/s, op: create_initial_integer]
--------------------------------------------------
Testing 10000 rules: 5005it [00:20, 249.97it/s, op: rule_8048_mul]
--------------------------------------------------
Testing 50000 rules: 4847it [01:22, 59.08it/s, op: rule_28153_sub]
==================================================
✅ Performance Scaling Summary
==================================================
# of Rules      | Iterations / sec
----------------+---------------------
10              |             3,539.81
50              |             2,017.09
100             |             1,050.94
150             |               778.81
200             |               804.99
1000            |               539.57
10000           |               254.66
50000           |                60.39
--------------------------------------------------
(The actual use case here is avoiding the use of draw when I want dynamic / dependently-typed strategies, which I want for better printing of Hypothesis failures. I can implement this a different way, but I figured I'd report it.)
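For context, here is a minimal sketch of the draw-based pattern I'm trying to avoid, using st.data() / data.draw() inside a single generic rule; the class and method names here are illustrative only:

import operator

import hypothesis.strategies as st
from hypothesis.stateful import Bundle, RuleBasedStateMachine, rule


class DrawBasedMachine(RuleBasedStateMachine):
    values = Bundle("values")

    @rule(target=values, n=st.integers(min_value=-100, max_value=100))
    def create_initial_integer(self, n):
        return n

    @rule(target=values, v1=values, v2=values, data=st.data())
    def apply_operation(self, v1, v2, data):
        # The operation is chosen inside the rule body via data.draw(), so a
        # failing example is reported in terms of this one generic rule plus
        # the drawn value, rather than a distinct, named rule per operation.
        op = data.draw(st.sampled_from([operator.add, operator.sub, operator.mul]))
        return op(v1, v2)

Generating separate rules instead, as the benchmark above does, means a failure names a specific rule, which is roughly what I mean by better printing.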
I'm not particularly worried about the performance of stateful tests with >100 rules (it's not a use case we design around, though if you've found it useful I'd be interested to hear details), but I still appreciate this report, because this scaling is dramatically worse than I would hope for.
I think the culprit here will be using sampled_from to choose the next rule, plus using a different .filter every single iteration to select the swarm-enabled rules, which prevents caching. It may be that this is not feasible to optimize, but we should still take a look.
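Roughly the shape I have in mind, as an illustrative sketch rather than the actual stateful implementation (the function and parameter names here are hypothetical):

import hypothesis.strategies as st


def rule_selection_strategy(rules, currently_enabled):
    # If the set of swarm-enabled rules changes between steps, this builds a
    # brand-new lambda (and therefore a brand-new filtered strategy) on every
    # step, so nothing keyed on the strategy object can be cached, and the
    # filtering itself scales with the number of rules.
    return st.sampled_from(rules).filter(lambda r: r in currently_enabled)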
The benchmark baseline rules are doing ~0 work, so the slowdown here would be less visible in any real project.
I encountered this when attempting to test random functions in sympy, and the slowdown was quite visible (somewhere between 2x and 10x, if I recall correctly -- which I guess you could indeed call "less visible").
Yeah, I didn't mean to suggest this has zero/low impact; my statement is trivially true, so maybe I shouldn't have even mentioned it 😄