whitebox
Perform global explainability on the inference dataset
As of now, the pipeline performs explainability per inference row. Explainability for the whole inference dataset may be useful in the future. Some related code has already been written:
- Pipeline
```python
import joblib
import lime.lime_tabular
import pandas as pd
from typing import Dict


def create_xai_pipeline_classification_per_inference_dataset(
    training_set: pd.DataFrame,
    target: str,
    inference_set: pd.DataFrame,
    type_of_task: str,
    load_from_path=None,
) -> Dict[str, Dict[str, float]]:
    xai_dataset = training_set.drop(columns=[target])
    explainability_report = {}

    # Build a mapping dict used later to map the explainer's feature
    # indices back to the feature names
    mapping_dict = {}
    for idx, feature_name in enumerate(xai_dataset.columns):
        mapping_dict[idx] = feature_name

    # Explainability for both classification tasks.
    # We have to revisit this in the future: if we load the model from the
    # file system, we don't care whether it is binary or multiclass.
    if type_of_task == 'multiclass_classification':
        # Give the option of retrieving the local model
        if load_from_path is not None:
            model = joblib.load('{}/lgb_multi.pkl'.format(load_from_path))
        else:
            model, _eval = create_multiclass_classification_training_model_pipeline(training_set, target)

        explainer = lime.lime_tabular.LimeTabularExplainer(
            xai_dataset.values,
            feature_names=xai_dataset.columns.tolist(),
            mode="classification",
            random_state=1,
        )
        for inference_row in range(len(inference_set)):
            # LIME expects class probabilities; the multiclass pipeline's
            # model.predict is assumed to return them
            exp = explainer.explain_instance(inference_set.values[inference_row], model.predict)
            med_report = exp.as_map()
            temp_dict = dict(list(med_report.values())[0])
            map_dict = {mapping_dict[name]: val for name, val in temp_dict.items()}
            explainability_report["row{}".format(inference_row)] = map_dict

    elif type_of_task == 'binary_classification':
        # Give the option of retrieving the local model
        if load_from_path is not None:
            model = joblib.load('{}/lgb_binary.pkl'.format(load_from_path))
        else:
            model, _eval = create_binary_classification_training_model_pipeline(training_set, target)

        explainer = lime.lime_tabular.LimeTabularExplainer(
            xai_dataset.values,
            feature_names=xai_dataset.columns.tolist(),
            mode="classification",
            random_state=1,
        )
        for inference_row in range(len(inference_set)):
            exp = explainer.explain_instance(inference_set.values[inference_row], model.predict_proba)
            med_report = exp.as_map()
            temp_dict = dict(list(med_report.values())[0])
            map_dict = {mapping_dict[name]: val for name, val in temp_dict.items()}
            explainability_report["row{}".format(inference_row)] = map_dict

    return explainability_report
```
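Since the goal of this issue is explainability over the whole inference dataset, one possible direction is to aggregate the per-row LIME contributions into a single global ranking, e.g. by mean absolute contribution per feature. A minimal sketch, assuming the `explainability_report` shape produced above (the `aggregate_global_explainability` helper name is hypothetical, not part of the existing pipeline):

```python
from typing import Dict


def aggregate_global_explainability(
    explainability_report: Dict[str, Dict[str, float]],
) -> Dict[str, float]:
    """Hypothetical helper: collapse per-row LIME contributions into one
    global score per feature (mean absolute contribution across rows)."""
    totals: Dict[str, float] = {}
    for row_report in explainability_report.values():
        for feature_name, contribution in row_report.items():
            totals[feature_name] = totals.get(feature_name, 0.0) + abs(contribution)
    n_rows = len(explainability_report)
    # Average and sort so the most influential features come first
    return dict(
        sorted(
            ((f, total / n_rows) for f, total in totals.items()),
            key=lambda item: item[1],
            reverse=True,
        )
    )
```

Mean absolute contribution is only one choice here; a signed mean or median would preserve direction at the cost of positive and negative contributions cancelling out.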
- Unit tests
```python
def test_create_xai_pipeline_classification_per_inference_dataset(self):
    binary_class_report = create_xai_pipeline_classification_per_inference_dataset(
        df_binary, "target", df_binary_inference, "binary_classification"
    )
    multi_class_report = create_xai_pipeline_classification_per_inference_dataset(
        df_multi, "target", df_multi_inference, "multiclass_classification"
    )

    binary_contribution_check_one = binary_class_report["row0"]["worst perimeter"]
    binary_contribution_check_two = binary_class_report["row2"]["worst texture"]
    multi_contribution_check_one = multi_class_report["row0"]["hue"]
    multi_contribution_check_two = multi_class_report["row9"]["proanthocyanins"]

    assert len(binary_class_report) == len(df_binary_inference)
    assert len(multi_class_report) == len(df_multi_inference)
    assert round(binary_contribution_check_one, 3) == 0.253
    assert round(binary_contribution_check_two, 2) == -0.09
    assert round(multi_contribution_check_one, 2) == -0.08
    assert round(multi_contribution_check_two, 3) == -0.023
```
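For context, the feature names in these checks ("worst perimeter", "hue", "proanthocyanins") suggest the fixtures come from scikit-learn's breast cancer and wine datasets. A sketch of how such fixtures could be built; the `make_fixture` helper and the split size are assumptions, not the repository's actual test setup:

```python
from sklearn.datasets import load_breast_cancer, load_wine


def make_fixture(loader, n_inference_rows: int = 10):
    # Assumed helper: as_frame=True yields a DataFrame with the
    # features plus a 'target' column
    df = loader(as_frame=True).frame
    # Hold out the last rows (features only) as the inference set;
    # 10 rows covers the "row9" lookup above
    train = df.iloc[:-n_inference_rows]
    inference = df.iloc[-n_inference_rows:].drop(columns=["target"])
    return train, inference


df_binary, df_binary_inference = make_fixture(load_breast_cancer)
df_multi, df_multi_inference = make_fixture(load_wine)
```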
@momegas I would like to start working on this
That would be great. Please start by opening a draft PR with the proposed changes. I'll assign this to you. Thanks 🙏