
Unused columns should not affect Categorical Target Drift calculation

danieljmv01 opened this issue on May 30, 2022

CatTargetDriftAnalyzer only uses the target and prediction columns, but it drops rows that contain NaNs in any column, even if that column is not used by the analyzer.
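For context on why this matters: with the data used in the test below, a blanket `dropna()` removes every row, while dropping NaNs only on the columns the analyzer actually needs keeps everything. A minimal pandas-only sketch (the `subset=` variant is just a suggestion for how the filtering could be narrowed, not the current implementation):

```python
import numpy as np
import pandas as pd

# Same data as in the test below: the target column is fully populated,
# but every row has a NaN in one of the unused columns.
data = {
    "target": ["a"] * 10 + ["b"] * 10,
    "foo": [1] * 10 + [np.nan] * 10,
    "bar": [np.nan] * 10 + [1] * 10,
}
df = pd.DataFrame(data)

print(df.dropna().shape)                   # (0, 3)  -- every row is dropped
print(df.dropna(subset=["target"]).shape)  # (20, 3) -- all rows are kept
```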

Also, if a used column ends up empty after invalid values are dropped, the exception that is raised is not informative at all.
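Concretely, what surfaces is the `IndexError: list index out of range` from `_z_stat_test` in the traceback below: both series are empty, so `ordered_keys` has no elements and `ordered_keys[0]` fails. A clearer behavior would be an explicit check before the stat test runs; a rough sketch (the helper name, message, and call site are only suggestions, not existing API):

```python
import pandas as pd


def _check_column_not_empty(series: pd.Series, column_name: str) -> None:
    # Fail fast with a descriptive message instead of letting an IndexError
    # escape from deep inside the stat test.
    if series.empty:
        raise ValueError(
            f"Column '{column_name}' has no values left after dropping missing data, "
            "so categorical target drift cannot be calculated."
        )


# e.g. called for both the reference and the current target series
# right before stattest(...) is invoked in the analyzer.
```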

I added a test to reproduce the problem; here is the pytest output:

______________________ test_structure_no_drift_with_nulls ______________________

analyzer = <evidently.analyzers.cat_target_drift_analyzer.CatTargetDriftAnalyzer object at 0x7f88ccf6af98>

    def test_drift_with_null_colums(analyzer: CatTargetDriftAnalyzer) -> None:
        """Test drift with columns with nulls.
    
        Test that not used columns with nulls does not change
        target drift.
        """
        data = {
            "target": ["a"] * 10 + ["b"] * 10,
            "foo": [1]*10 + [np.nan] * 10,
            "bar": [np.nan] * 10 + [1] * 10,
        }
        df1 = DataFrame(data)
        df2 = DataFrame(data)
    
>       result = analyzer.calculate(df1, df2, ColumnMapping())

tests/analyzers/test_categorical_target_drift_analyzer.py:217: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
src/evidently/analyzers/cat_target_drift_analyzer.py:154: in calculate
    reference_data, current_data, feature_type, target_column, target_test, threshold
src/evidently/analyzers/cat_target_drift_analyzer.py:38: in _compute_statistic
    return stattest(reference_data[column_name], current_data[column_name], feature_type, threshold)
src/evidently/analyzers/stattests/registry.py:28: in __call__
    self.default_threshold if threshold is None else threshold)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

reference_data = Series([], Name: target, dtype: object)
current_data = Series([], Name: target, dtype: object), feature_type = 'cat'
threshold = 0.05

    def _z_stat_test(
            reference_data: pd.Series,
            current_data: pd.Series,
            feature_type: str,
            threshold: float) -> Tuple[float, bool]:
        #  TODO: simplify ignoring NaN values here, in chi_stat_test and data_drift_analyzer
        if (reference_data.nunique() == 1
                and current_data.nunique() == 1
                and reference_data.unique()[0] == current_data.unique()[0]):
            p_value = 1
        else:
            keys = set(list(reference_data.unique()) + list(current_data.unique())) - {np.nan}
            ordered_keys = sorted(list(keys))
            p_value = proportions_diff_z_test(
                proportions_diff_z_stat_ind(
>                   reference_data.apply(lambda x, key=ordered_keys[0]: 0 if x == key else 1),
                    current_data.apply(lambda x, key=ordered_keys[0]: 0 if x == key else 1)
                )
            )
E           IndexError: list index out of range
