Hyperactive
⚡️ Speed up method `HyperactiveSearchCV._refit` by 25%
📄 25% (0.25x) speedup for HyperactiveSearchCV._refit in src/hyperactive/integrations/sklearn/hyperactive_search_cv.py
⏱️ Runtime: 209 milliseconds → 167 milliseconds (best of 15 runs)
📝 Explanation and details
To optimize the `HyperactiveSearchCV` class for better performance, we can make some adjustments focusing on the initialization logic and the `_refit` method. Here are the changes:

- Instead of calling `clone` twice in the `_refit` method, apply `set_params` directly on the cloned estimator.
- Keep the parameters passed to `set_params` minimal and avoid unnecessary cloning.
- NumPy could help where applicable, although this snippet is probably not the best place for NumPy-based enhancements.

Let's look at what the updated code could involve.
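As a minimal sketch (assuming the attribute names `estimator`, `best_params_`, and `best_estimator_` and this method signature, none of which are reproduced in the report above), the optimized `_refit` might look roughly like this:

```python
# Minimal sketch only -- an assumed shape of the optimized _refit, not the
# actual hyperactive source. Attribute names and signature are inferred from
# the description above.
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression


class HyperactiveSearchCVSketch:
    def __init__(self, estimator, params_config):
        self.estimator = estimator
        self.params_config = params_config

    def _refit(self, X, y=None, **fit_params):
        # One clone of the base estimator, then set_params with the best
        # parameters directly -- no second clone of the parameter dict.
        self.best_estimator_ = clone(self.estimator).set_params(**self.best_params_)
        self.best_estimator_.fit(X, y, **fit_params)
        return self


# Usage sketch: assign best_params_ (normally produced by the search), then refit.
if __name__ == "__main__":
    import numpy as np

    search = HyperactiveSearchCVSketch(LogisticRegression(), {"C": [0.1, 1, 10]})
    search.best_params_ = {"C": 1}
    search._refit(np.array([[1, 2], [3, 4], [5, 6]]), np.array([0, 1, 0]))
```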
Comments on the changes:

- Removed the redundant `clone` operation in `_refit`.
- Set parameters directly via `set_params` to minimize overhead and avoid unnecessary object creation.

This makes the re-fitting process more efficient and reduces computational overhead. If other parts of the program deal with I/O or could use parallel processing more efficiently (for example the parallel jobs indicated by `n_jobs`), those would be the next target for optimization; a rough illustration of that kind of parallelism follows.
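The block below is an illustration only, not code from this PR: it shows the coarse-grained, joblib-style parallel fitting that an `n_jobs` parameter typically delegates to. The candidate list and helper function are made up for the example.

```python
# Illustration only: not HyperactiveSearchCV code. Shows the joblib-based
# parallelism that an n_jobs-style parameter usually maps to.
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

X = np.random.rand(200, 5)
y = np.random.randint(0, 2, 200)

candidate_params = [{"C": 0.1}, {"C": 1.0}, {"C": 10.0}]


def fit_candidate(params):
    # Each candidate fit is independent, so it can run in its own worker.
    return clone(LogisticRegression()).set_params(**params).fit(X, y)


# n_jobs=2 distributes the candidate fits across two workers.
fitted = Parallel(n_jobs=2)(delayed(fit_candidate)(p) for p in candidate_params)
```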
✅ Correctness verification report:
| Test | Status |
|---|---|
| ⚙️ Existing Unit Tests | 🔘 None Found |
| 🌀 Generated Regression Tests | ✅ 39 Passed |
| ⏪ Replay Tests | 🔘 None Found |
| 🔎 Concolic Coverage Tests | 🔘 None Found |
| 📊 Tests Coverage | 100.0% |
🌀 Generated Regression Tests Details
```python
from typing import Callable, Dict, Type, Union

import numpy as np
# imports
import pytest  # used for our unit tests
from hyperactive.integrations.sklearn.hyperactive_search_cv import \
    HyperactiveSearchCV
from sklearn.base import BaseEstimator, clone
from sklearn.linear_model import LogisticRegression

# unit tests

# Basic Functionality
def test_refit_standard_case():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y)

# Edge Cases
def test_refit_empty_dataset():
    X = np.array([])
    y = np.array([])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    with pytest.raises(ValueError):
        search._refit(X, y)

def test_refit_single_sample():
    X = np.array([[1, 2]])
    y = np.array([0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y)

def test_refit_single_feature():
    X = np.array([[1], [2], [3]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y)

def test_refit_single_class():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 0, 0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    with pytest.raises(ValueError):
        search._refit(X, y)

# Invalid Inputs
def test_refit_incorrect_data_types():
    X = [[1, 2], [3, 4], [5, 6]]  # List instead of numpy array
    y = "invalid"  # String instead of array-like
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    with pytest.raises(TypeError):
        search._refit(X, y)

def test_refit_mismatched_dimensions():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1])  # Mismatched length
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    with pytest.raises(ValueError):
        search._refit(X, y)

# Hyperparameter Configurations
def test_refit_no_hyperparameters():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {}
    search._refit(X, y)

def test_refit_invalid_hyperparameters():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {'invalid_param': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'invalid_param': 1}
    with pytest.raises(ValueError):
        search._refit(X, y)

# Performance and Scalability
def test_refit_large_dataset():
    X = np.random.rand(10000, 100)
    y = np.random.randint(0, 2, 10000)
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y)

def test_refit_high_dimensional_data():
    X = np.random.rand(10, 10000)
    y = np.random.randint(0, 2, 10)
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y)

# Special Cases
def test_refit_custom_fit_params():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config)
    search.best_params_ = {'C': 1}
    search._refit(X, y, sample_weight=np.array([1, 1, 1]))

# Integration with Cross-Validation
def test_refit_random_state():
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([0, 1, 0])
    estimator = LogisticRegression()
    params_config = {'C': [0.1, 1, 10]}
    search = HyperactiveSearchCV(estimator, params_config, random_state=42)
    search.best_params_ = {'C': 1}
    search._refit(X, y)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```

```python
from typing import Callable, Dict, Type, Union

import numpy as np
# imports
import pytest  # used for our unit tests
from hyperactive.integrations.sklearn.hyperactive_search_cv import \
    HyperactiveSearchCV
from sklearn.base import BaseEstimator, clone

# Mock Estimator for testing
class MockEstimator(BaseEstimator):
    def __init__(self, param=1):
        self.param = param

    def fit(self, X, y=None, **fit_params):
        self.is_fitted_ = True
        return self

from hyperactive.integrations.sklearn.hyperactive_search_cv import \
    HyperactiveSearchCV

# unit tests

# Basic Functionality
def test_refit_basic():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = {'param': 2}
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([1, 2, 3])
    model._refit(X, y)

# Edge Cases
def test_refit_single_data_point():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = {'param': 2}
    X = np.array([[1, 2]])
    y = np.array([1])
    model._refit(X, y)

def test_refit_invalid_params():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = {'invalid_param': 2}
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([1, 2, 3])
    with pytest.raises(ValueError):
        model._refit(X, y)

def test_refit_missing_params():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = None
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([1, 2, 3])
    with pytest.raises(TypeError):
        model._refit(X, y)

# Additional Fit Parameters
def test_refit_with_additional_fit_params():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = {'param': 2}
    X = np.array([[1, 2], [3, 4], [5, 6]])
    y = np.array([1, 2, 3])
    model._refit(X, y, sample_weight=[1, 1, 1])

# Large Scale Test Cases
def test_refit_large_scale():
    model = HyperactiveSearchCV(estimator=MockEstimator(), params_config={'param': [1, 2, 3]})
    model.best_params_ = {'param': 2}
    X = np.random.rand(1000, 10)
    y = np.random.randint(0, 2, 1000)
    model._refit(X, y)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
```
To edit these changes, run `git checkout codeflash/optimize-HyperactiveSearchCV._refit-m8ey8ag3` and push.
@fkiraly, I thought cloning the parameters as well was necessary, or at least more thorough. But we can remove it if it is not necessary.
> @fkiraly, I thought cloning the parameters as well was necessary, or at least more thorough.

It sounds logical if you think about it from first principles, though I have not seen it done before. I am not sure how to resolve this contradiction for now, but I do agree it seems incongruent.
What I am wondering about: are these numbers accurate? We are just cloning a dict, or passing it; that should not take 0.2 seconds.
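As a rough, hedged sanity check of that point (not from the PR itself): sklearn's `clone(..., safe=False)` falls back to `copy.deepcopy` for non-estimator objects, so cloning a small parameter dict is on the order of microseconds and cannot by itself explain ~0.2 s.

```python
# Hedged sanity check (not from the PR): time cloning vs. copying a small
# parameter dict. clone(obj, safe=False) deep-copies non-estimator objects.
import timeit

from sklearn.base import clone

best_params = {"C": 1}

t_clone = timeit.timeit(lambda: clone(best_params, safe=False), number=10_000)
t_copy = timeit.timeit(lambda: dict(best_params), number=10_000)

print(f"clone(dict, safe=False): {t_clone:.4f} s for 10k calls")
print(f"plain dict copy:         {t_copy:.4f} s for 10k calls")
```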
The runtime number is the sum of the individual runtimes of all the test cases attached in the PR description.
So we would not know, for instance, whether this is just because the new code triggers fewer exceptions in cases where exceptions would be sensible? Exception traceback generation takes substantial runtime.
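A rough, hedged illustration of that last point (not part of the PR): raising and catching an exception costs far more than a plain return, so a summed-runtime comparison can shift noticeably if one version hits fewer exception paths.

```python
# Illustration only: exception raise/catch vs. a plain return in a tight loop.
import timeit


def raises():
    try:
        raise ValueError("bad input")
    except ValueError:
        return None


def returns():
    return None


print("raise + catch:", timeit.timeit(raises, number=100_000))
print("plain return: ", timeit.timeit(returns, number=100_000))
```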