Trane
Trane copied to clipboard
Add threshold function that uses entropy to maximize uncertainty
def find_threshold_to_maximize_uncertainty(
    self,
    df,
    label_col,
    entity_col,
    max_num_unique_values=10,
    max_number_of_rows=2000,
    random_state=None,
):
    """Return the candidate threshold whose generated labels have the most entropy.

    Each distinct value found in ``df[label_col]`` is tried as the op's
    ``threshold``. For every candidate, ``self.label_function`` is applied to
    each ``entity_col`` group of ``df`` and the entropy of the resulting
    ``label_col`` is computed with ``entropy_of_list``. The candidate with the
    strictly highest entropy wins; higher entropy means the labels are less
    predictable.

    The op's own ``threshold`` is restored before returning, including when
    evaluating a candidate raises.

    Args:
        df: DataFrame to search over.
        label_col: Column supplying the candidate thresholds and read back
            from the grouped output to measure entropy.
        entity_col: Column ``df`` is grouped by before applying
            ``self.label_function``.
        max_num_unique_values: Accepted for interface compatibility; not
            applied (see the NOTE below).
        max_number_of_rows: Accepted for interface compatibility; not applied.
        random_state: Accepted for interface compatibility; not applied.

    Returns:
        The best candidate threshold, or ``0`` if no candidate produced an
        entropy greater than zero (including when ``df`` has no rows).
    """
    original_threshold = self.threshold
    best_entropy = 0
    best_parameter_value = 0

    # NOTE(review): the previous code drew a sample with
    # sample_unique_values(...) and then immediately replaced it with every
    # distinct value, so the sample never influenced the result. The search
    # stays exhaustive here to keep results unchanged; wire the sampling
    # parameters back in deliberately if large frames need capping.
    candidate_thresholds = set(df[label_col])

    try:
        for candidate in candidate_thresholds:
            self.set_parameters(threshold=candidate)
            output_df = df.groupby(entity_col).apply(self.label_function)
            current_entropy = entropy_of_list(output_df[label_col])
            # Strict comparison: on ties the first candidate encountered wins.
            if current_entropy > best_entropy:
                best_entropy = current_entropy
                best_parameter_value = candidate
    finally:
        # Never leave the op configured with a trial threshold, even when a
        # candidate evaluation raises part-way through the search.
        self.set_parameters(threshold=original_threshold)

    return best_parameter_value
def test_find_threshold_to_maximize_uncertanity(df):
    """The threshold search returns 10 and leaves the op's threshold untouched."""
    initial_threshold = 30.0
    filter_op = GreaterFilterOp("col")
    filter_op.set_parameters(threshold=initial_threshold)

    search_options = {
        "label_col": "col",
        "entity_col": "id",
        "random_state": 0,
        "max_num_unique_values": 2,
    }
    chosen_threshold = filter_op.find_threshold_to_maximize_uncertainty(
        df,
        **search_options,
    )

    # Per the fixture, 10 is the smallest value in "col"; using it as the
    # threshold keeps most of the column's values and so maximizes
    # unpredictability.
    assert chosen_threshold == 10
    # The search must put the op's configured threshold back when it finishes.
    assert filter_op.threshold == initial_threshold