evalml icon indicating copy to clipboard operation
evalml copied to clipboard

Raise an error if the target column has an invalid logical type

Open gsheni opened this issue 2 years ago • 0 comments
trafficstars

  • As a user of EvalML, I expect EvalML to check the Logical Type of my target column to determine if it is valid.
    • If it is not valid, but can be cast to the correct type, I expect EvalML to change the Logical Type.
    • If it is not valid, and cannot be cast to the correct type, I expect EvalML to raise an error.
def check_target_logical_type(y, problem_type):
    """Check the target's Woodwork logical type against the problem type.

    Args:
        y: Target whose ``y.ww.schema.logical_type`` is inspected; Woodwork
            must already be initialized on it for the handled problem types.
        problem_type: The ``ProblemTypes`` member describing the task.

    Returns:
        ``y`` itself when its logical type is acceptable (or when the problem
        type is not one of those handled here). For a multiclass problem
        whose target is not already Categorical, the result of
        ``y.ww.set_logical_type("Categorical")`` is returned instead.

    Raises:
        ValueError: If a regression / time series regression target is not
            Integer, IntegerNullable or Double, or if a binary target is not
            Boolean, BooleanNullable or Categorical.
    """
    regression_problems = (
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
    )

    # The logical type is looked up inside each branch on purpose: problem
    # types that are not handled below never touch ``y.ww`` at all.
    if problem_type in regression_problems:
        numeric_types = (Integer, IntegerNullable, Double)
        if not isinstance(y.ww.schema.logical_type, numeric_types):
            raise ValueError(
                "Regression problem type requires a Integer, IntegerNullable or Double target",
            )
        return y

    if problem_type == ProblemTypes.MULTICLASS:
        if isinstance(y.ww.schema.logical_type, Categorical):
            return y
        # Multiclass is the only case that is recast rather than rejected.
        return y.ww.set_logical_type("Categorical")

    if problem_type == ProblemTypes.BINARY:
        binary_types = (Boolean, BooleanNullable, Categorical)
        if not isinstance(y.ww.schema.logical_type, binary_types):
            raise ValueError(
                "Binary problem type requires a Boolean, BooleanNullable or Categorical target",
            )

    # NOTE(review): any other problem type (e.g. time series classification)
    # falls through to here and is returned without validation -- confirm
    # that this is intended.
    return y

Tests

def test_check_target_logical_type():
    """Exercise check_target_logical_type with Integer and Categorical targets."""
    # Integer target: accepted for both regression flavours.
    int_target = pd.Series([1, 2, 2, 3, 3, 1], dtype="int64")
    int_target.ww.init(logical_type="Integer")
    for regression_problem in (
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
    ):
        check_target_logical_type(int_target, regression_problem)

    # Integer target: rejected for binary problems.
    with pytest.raises(ValueError, match="Binary problem type requires a"):
        check_target_logical_type(int_target, ProblemTypes.BINARY)

    # Integer target: recast to Categorical for multiclass problems.
    recast_target = check_target_logical_type(int_target, ProblemTypes.MULTICLASS)
    assert type(recast_target.ww.schema.logical_type) is Categorical

    # Categorical target: rejected for regression problems.
    cat_target = pd.Series(["red", "blue", "blue"], dtype="category")
    cat_target.ww.init(logical_type="Categorical")
    with pytest.raises(ValueError, match="Regression problem type requires a"):
        check_target_logical_type(cat_target, ProblemTypes.REGRESSION)

gsheni avatar Mar 28 '23 18:03 gsheni