evidently
evidently copied to clipboard
Unused columns should not affect the Categorical Target Drift calculation
CatTargetDriftAnalyzer
only uses the target or the prediction columns, but it drops rows that contain NaNs in any column, even if that column is not used by this analyzer.
Also, if any used column is empty after dropping invalid values, the exception raised is not informative at all.
I added a test to replicate the problem; here is the pytest output:
______________________ test_structure_no_drift_with_nulls ______________________
analyzer = <evidently.analyzers.cat_target_drift_analyzer.CatTargetDriftAnalyzer object at 0x7f88ccf6af98>
def test_drift_with_null_colums(analyzer: CatTargetDriftAnalyzer) -> None:
"""Test drift with columns with nulls.
Test that not used columns with nulls does not change
target drift.
"""
data = {
"target": ["a"] * 10 + ["b"] * 10,
"foo": [1]*10 + [np.nan] * 10,
"bar": [np.nan] * 10 + [1] * 10,
}
df1 = DataFrame(data)
df2 = DataFrame(data)
> result = analyzer.calculate(df1, df2, ColumnMapping())
tests/analyzers/test_categorical_target_drift_analyzer.py:217:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src/evidently/analyzers/cat_target_drift_analyzer.py:154: in calculate
reference_data, current_data, feature_type, target_column, target_test, threshold
src/evidently/analyzers/cat_target_drift_analyzer.py:38: in _compute_statistic
return stattest(reference_data[column_name], current_data[column_name], feature_type, threshold)
src/evidently/analyzers/stattests/registry.py:28: in __call__
self.default_threshold if threshold is None else threshold)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
reference_data = Series([], Name: target, dtype: object)
current_data = Series([], Name: target, dtype: object), feature_type = 'cat'
threshold = 0.05
def _z_stat_test(
reference_data: pd.Series,
current_data: pd.Series,
feature_type: str,
threshold: float) -> Tuple[float, bool]:
# TODO: simplify ignoring NaN values here, in chi_stat_test and data_drift_analyzer
if (reference_data.nunique() == 1
and current_data.nunique() == 1
and reference_data.unique()[0] == current_data.unique()[0]):
p_value = 1
else:
keys = set(list(reference_data.unique()) + list(current_data.unique())) - {np.nan}
ordered_keys = sorted(list(keys))
p_value = proportions_diff_z_test(
proportions_diff_z_stat_ind(
> reference_data.apply(lambda x, key=ordered_keys[0]: 0 if x == key else 1),
current_data.apply(lambda x, key=ordered_keys[0]: 0 if x == key else 1)
)
)
E IndexError: list index out of range