feature_engine icon indicating copy to clipboard operation
feature_engine copied to clipboard

Draft of utility functions

Open adalseno opened this issue 4 years ago • 1 comments

Hi, as discussed in the course, here is a draft of the functions that are used several times throughout it (I tried them and they seem to work fine, but I haven't tested them thoroughly):

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# for Q-Q plots
import scipy.stats as stats



def diagnostic_boxplots(df, variable, figsize=None):
    ''' Draw a histogram, a Q-Q plot and a boxplot of one variable,
    side by side in a single figure, and display it.

    df: the DataFrame that holds the data
    variable: the column of df to plot
    figsize: optional (width, height) tuple for the figure,
             default to (16,4)

    Nothing is returned; the figure is shown with plt.show(). '''

    # define figure size
    # identity test: "== None" is evaluated element-wise by array-like
    # arguments, so it is not a reliable "argument not given" check
    if figsize is None:
        figsize = (16, 4)
    plt.figure(figsize=figsize)

    # histogram
    plt.subplot(1, 3, 1)
    # NOTE(review): seaborn deprecated distplot in 0.11 in favour of
    # histplot/displot; kept here so the function still runs on the
    # older seaborn releases - migrate once the minimum version allows.
    sns.distplot(df[variable], bins=30)
    plt.title('Histogram')

    # Q-Q plot against the normal distribution
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('Variable quantiles')

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')

    plt.show()



# function to find upper and lower boundaries
# for normally distributed variables


def find_normal_boundaries(df, variable, distance=3):

    ''' Return the (upper, lower) limits beyond which the values of
     df[variable] are regarded as outliers under a Gaussian assumption:
     the mean plus / minus `distance` standard deviations, default 3 '''

    column = df[variable]
    centre = column.mean()
    spread = distance * column.std()

    return centre + spread, centre - spread


# function to find upper and lower boundaries
# for skewed distributed variables


def find_skewed_boundaries(df, variable, distance):

    ''' Return the (upper, lower) limits beyond which the values of
     df[variable] are regarded as outliers for a skewed distribution.

     The limits sit `distance` inter-quartile ranges above the 75th
     percentile and below the 25th percentile; pass e.g. 1.5 or 3
     as distance.'''

    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    # width of the margin added outside each quartile
    margin = (third_quartile - first_quartile) * distance

    return third_quartile + margin, first_quartile - margin


def diagnostic_plots(df, variable, figsize=None):

    ''' Draw a histogram and a Q-Q plot of one variable, side by side
    in a single figure, and display it.

    df: the DataFrame that holds the data
    variable: the column of df to plot
    figsize: optional (width, height) tuple for the figure,
             default to (15,6)

    Nothing is returned; the figure is shown with plt.show(). '''

    # identity test: "== None" is evaluated element-wise by array-like
    # arguments, so it is not a reliable "argument not given" check
    if figsize is None:
        figsize = (15, 6)
    plt.figure(figsize=figsize)

    # histogram
    plt.subplot(1, 2, 1)
    df[variable].hist(bins=30)

    # Q-Q plot against the normal distribution
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)

    plt.show()


def set_boundaries(df, variable, upper_limit=None, lower_limit=None):
    ''' Cap the values of df[variable] at the given boundaries, one or both.

    df: the DataFrame that holds the data
    variable: the column of df to cap
    upper_limit: values greater than this are replaced by it (optional)
    lower_limit: values smaller than this are replaced by it (optional)

    Returns df[variable] itself (a Series) when no limit is given,
    otherwise the numpy array produced by np.where. df is not modified.

    The limits are tested with "is None" rather than "== None", so an
    array-like limit (one value per row) no longer fails with an
    ambiguous truth value error. '''

    values = df[variable]

    if upper_limit is None and lower_limit is None:
        return values

    if lower_limit is None:
        return np.where(values > upper_limit, upper_limit, values)

    if upper_limit is None:
        return np.where(values < lower_limit, lower_limit, values)

    # both limits: the upper limit is tested first, as in the nested form
    return np.where(values > upper_limit, upper_limit,
                    np.where(values < lower_limit, lower_limit, values))


def flag_boundaries(df, variable, upper_limit=None, lower_limit=None):
    ''' Flag the values of df[variable] that fall outside the given
    boundaries, one or both.

    df: the DataFrame that holds the data
    variable: the column of df to inspect
    upper_limit: values greater than this are flagged (optional)
    lower_limit: values smaller than this are flagged (optional)

    Returns the scalar False when no limit is given, otherwise a
    boolean numpy array with True where the value is beyond a limit. '''

    if upper_limit is None and lower_limit is None:
        return False

    values = df[variable]

    if lower_limit is None:
        return np.where(values > upper_limit, True, False)

    if upper_limit is None:
        return np.where(values < lower_limit, True, False)

    # both limits given: this result must be returned explicitly,
    # otherwise the function falls through and yields None
    return np.where(values > upper_limit, True,
                    np.where(values < lower_limit, True, False))




def find_quantile_boundaries(df, variable, lower_quantile=0.05, upper_quantile=0.95):

    ''' Return the (upper, lower) boundaries of df[variable] taken at
    the requested quantiles, default to 0.95 and 0.05 '''

    column = df[variable]

    return column.quantile(upper_quantile), column.quantile(lower_quantile)

Nothing new but they look convenient to me.

adalseno avatar Jun 22 '20 21:06 adalseno

thank you!

solegalli avatar Jun 23 '20 10:06 solegalli