Trane icon indicating copy to clipboard operation
Trane copied to clipboard

Add threshold function that uses entropy to maximize uncertainty

Open gsheni opened this issue 1 year ago • 0 comments

    def find_threshold_to_maximize_uncertainty(
        self,
        df,
        label_col,
        entity_col,
        max_num_unique_values=10,
        max_number_of_rows=2000,
        random_state=None,
    ):
        """Return the candidate threshold whose labels have the highest entropy.

        Candidate thresholds are drawn from ``df[label_col]`` via
        ``sample_unique_values``. Each candidate is set as this operation's
        threshold, ``self.label_function`` is applied to every ``entity_col``
        group of ``df``, and the entropy of the resulting ``label_col`` values
        is measured with ``entropy_of_list``. More entropy means a less
        predictable label, which is what we want a prediction problem to have.

        The threshold that was set before the call is always restored, even
        if evaluating a candidate raises.

        Args:
            df: Data to search over; must support ``df[label_col]`` and
                ``df.groupby(entity_col)``.
            label_col: Column supplying the candidate thresholds and read
                back from the labelled output.
            entity_col: Column used to group ``df`` before labelling.
            max_num_unique_values: Passed to ``sample_unique_values`` as the
                limit on how many candidates are drawn.
            max_number_of_rows: Currently unused; row downsampling is not
                applied.
            random_state: Passed to ``sample_unique_values``.

        Returns:
            The first candidate that achieves the strictly greatest entropy,
            or ``0`` if no candidate yields an entropy above zero.
        """
        original_threshold = self.threshold

        # Only the sampled candidates are evaluated, so the cost of the search
        # is governed by max_num_unique_values rather than by the number of
        # distinct values in the column.
        candidate_thresholds = sample_unique_values(
            df[label_col],
            max_num_unique_values,
            random_state,
        )

        # NOTE(review): downsampling df to max_number_of_rows is not enabled,
        # so every candidate is scored against the full frame.

        best_entropy = 0
        best_parameter_value = 0

        try:
            for candidate in candidate_thresholds:
                self.set_parameters(threshold=candidate)

                # presumably the labelled output keeps a label_col column;
                # verify against label_function
                output_df = df.groupby(entity_col).apply(self.label_function)
                current_entropy = entropy_of_list(output_df[label_col])

                # Strict comparison: on ties the earliest candidate wins.
                if current_entropy > best_entropy:
                    best_entropy = current_entropy
                    best_parameter_value = candidate
        finally:
            # Never leave the operation configured with a trial threshold.
            self.set_parameters(threshold=original_threshold)

        return best_parameter_value
def test_find_threshold_to_maximize_uncertanity(df):
    """Check the entropy-based threshold search and that it restores state."""
    initial_threshold = 30.0
    filter_op = GreaterFilterOp("col")
    filter_op.set_parameters(threshold=initial_threshold)

    search_kwargs = {
        "label_col": "col",
        "entity_col": "id",
        "random_state": 0,
        "max_num_unique_values": 2,
    }
    chosen_threshold = filter_op.find_threshold_to_maximize_uncertainty(
        df,
        **search_kwargs,
    )

    # Per the original author, 10 is the lowest value in "col": it keeps most
    # of the values and so maximizes unpredictability.
    assert chosen_threshold == 10
    # The search must leave the previously configured threshold untouched.
    assert filter_op.threshold == 30.0

gsheni avatar Jul 06 '23 05:07 gsheni