feature_engine
feature_engine copied to clipboard
Draft of utility functions
Hi, as discussed in the course, here is a draft of the helper functions that are used several times throughout it (I tried them and they seem to work fine, but I haven't tested them thoroughly):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for Q-Q plots
import scipy.stats as stats
def diagnostic_boxplots(df, variable, figsize=None):
    """Show a histogram, a Q-Q plot and a boxplot of one variable side by side.

    Parameters
    ----------
    df : dataframe
        Data holding the column to inspect; indexed as ``df[variable]``.
    variable : column label
        The variable of interest.
    figsize : tuple, optional
        Figure size handed to ``plt.figure``. Defaults to ``(16, 4)``.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``.
    """
    # Identity check: ``== None`` is evaluated element-wise for array-like
    # values and is discouraged by PEP 8 for singleton comparisons.
    if figsize is None:
        figsize = (16, 4)
    plt.figure(figsize=figsize)

    # Panel 1: histogram.
    plt.subplot(1, 3, 1)
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; migrate to
    # sns.histplot / sns.displot once the minimum seaborn version is confirmed.
    sns.distplot(df[variable], bins=30)
    plt.title('Histogram')

    # Panel 2: Q-Q plot against the normal distribution.
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('Variable quantiles')

    # Panel 3: boxplot.
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')

    plt.show()
# function to find upper and lower boundaries
# for normally distributed variables
def find_normal_boundaries(df, variable, distance=3):
    """Return the outlier limits of a Gaussian-like variable.

    The limits sit ``distance`` standard deviations (default 3) on either
    side of the mean of ``df[variable]``.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note that upper comes first.
    """
    column = df[variable]
    centre = column.mean()
    spread = column.std()
    return centre + distance * spread, centre - distance * spread
# function to find upper and lower boundaries
# for skewed distributed variables
def find_skewed_boundaries(df, variable, distance):
    """Return the outlier limits of a skewed variable using the IQR rule.

    The limits are placed ``distance`` times the inter-quartile range below
    the 25th percentile and above the 75th percentile of ``df[variable]``.
    Passing ``distance`` as an argument lets the caller choose, for example,
    1.5 or 3 times the IQR.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note that upper comes first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    margin = (third_quartile - first_quartile) * distance
    return third_quartile + margin, first_quartile - margin
def diagnostic_plots(df, variable, figsize=None):
    """Show a histogram and a Q-Q plot of one variable side by side.

    Parameters
    ----------
    df : dataframe
        Data holding the column to inspect; indexed as ``df[variable]``.
    variable : column label
        The variable to plot.
    figsize : tuple, optional
        Figure size handed to ``plt.figure``. Defaults to ``(15, 6)``.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``.
    """
    # Identity check: ``== None`` is evaluated element-wise for array-like
    # values and is discouraged by PEP 8 for singleton comparisons.
    if figsize is None:
        figsize = (15, 6)
    plt.figure(figsize=figsize)

    # Left panel: histogram with 30 bins.
    plt.subplot(1, 2, 1)
    df[variable].hist(bins=30)

    # Right panel: Q-Q plot against the normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)

    plt.show()
def set_boundaries(df, variable, upper_limit=None, lower_limit=None):
    """Cap the values of ``df[variable]`` at the given limit(s).

    Values above ``upper_limit`` are replaced by ``upper_limit`` and values
    below ``lower_limit`` by ``lower_limit``. Either limit may be omitted.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : dataframe
        Data holding the column to cap.
    variable : column label
        The variable to cap.
    upper_limit, lower_limit : scalar or array-like, optional
        Capping values. Array-like limits are applied element-wise by
        ``np.where``.

    Returns
    -------
    The unchanged ``df[variable]`` when no limit is given, otherwise the
    array produced by ``np.where``.
    """
    values = df[variable]

    # ``is None`` rather than ``== None``: the equality form is evaluated
    # element-wise for array-like limits and then fails inside ``if``.
    if upper_limit is None and lower_limit is None:
        return values
    if lower_limit is None:
        return np.where(values > upper_limit, upper_limit, values)
    if upper_limit is None:
        return np.where(values < lower_limit, lower_limit, values)

    # Both limits: the upper cap is checked first, then the lower one.
    return np.where(
        values > upper_limit,
        upper_limit,
        np.where(values < lower_limit, lower_limit, values),
    )
def flag_boundaries(df, variable, upper_limit=None, lower_limit=None):
    """Flag the values of ``df[variable]`` that fall outside the limit(s).

    Parameters
    ----------
    df : dataframe
        Data holding the column to check.
    variable : column label
        The variable to check.
    upper_limit, lower_limit : scalar or array-like, optional
        Values strictly above ``upper_limit`` or strictly below
        ``lower_limit`` are flagged. Either limit may be omitted.

    Returns
    -------
    The scalar ``False`` when no limit is given, otherwise a boolean array
    with ``True`` where the value lies outside the limit(s).
    """
    # ``is None`` rather than ``== None``: the equality form is evaluated
    # element-wise for array-like limits and then fails inside ``if``.
    if upper_limit is None and lower_limit is None:
        return False

    values = df[variable]
    if lower_limit is None:
        return np.where(values > upper_limit, True, False)
    if upper_limit is None:
        return np.where(values < lower_limit, True, False)

    # Both limits given. The original computed this mask but never returned
    # it, so callers received None.
    return np.where(
        values > upper_limit,
        True,
        np.where(values < lower_limit, True, False),
    )
def find_quantile_boundaries(df, variable, lower_quantile=0.05, upper_quantile=0.95):
    """Return the outlier limits of ``df[variable]`` as two of its quantiles.

    The quantiles default to 0.05 (lower) and 0.95 (upper).

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note that upper comes first.
    """
    column = df[variable]
    return column.quantile(upper_quantile), column.quantile(lower_quantile)
Nothing new but they look convenient to me.
thank you!