pyjanitor
pyjanitor copied to clipboard
Add the ability to test whether data is in range in a pandas data frame
Brief Description
Following up on #703, this issue seeks to introduce the ability to test whether each column in a data frame is within a user-defined range of values.
I would like to propose..
Example API
def flag_values_not_in_range(
df: pd.DataFrame,
bound: list,
inclusive: bool = True
) -> pd.DataFrame:
"""
:param df: data frame to test if values in columns are within a range
:param bound: user-defined lower and upper bound of values to test in data frame columns
:param inclusive: include boundaries in filtering.
Defaults to True
:return: True if value in column does not fall within range.
False if value in column falls within range.
"""
Based on my test, approach -1 seems to be more efficient, welcome any other feedback from the package leads:
import pandas as pd
import numpy as np
from time import time
from random import random
# Build a random data set
n_elements = 10**7
# ts_index = pd.date_range('1/1/2000', periods=1000, freq='T')
v1 = [random() for i in range(n_elements)]
v2 = [random() for j in range(n_elements)]
v3 = [random() for k in range(n_elements)]
v4 = [random() for l in range(n_elements)]
v5 = [random() for n in range(n_elements)]
v6 = [random() for m in range(n_elements)]
df = pd.DataFrame({'v1':v1, 'v2':v2, 'v3':v3, 'v4':v4, 'v5':v5, 'v6':v6})
df1 = df.copy(deep=True)
df2 = df.copy(deep=True)
n_columns = df.shape[1]
# Define random values for range
lower_bound = 20
upper_bound = 1000
inclusive = True
columns = df.columns
# Approach - 1
approach_1_timer_start = time()
for column in columns:
df[column + '_flag'] = df[column].between(left=lower_bound, right=upper_bound, inclusive=inclusive)
approach_1_timer_stop = time()
approach_1_time = round(approach_1_timer_stop - approach_1_timer_start, 2)
# Approach - 2
approach_2_timer_start = time()
for column in columns:
if inclusive:
conditions = [(df[column] >= lower_bound) & (df[column] <= upper_bound)]
else:
conditions = [(df[column] > lower_bound) & (df[column] < upper_bound)]
choices = [True]
df1[column + '_flag'] = np.select(conditions, choices, default=False)
approach_2_timer_stop = time()
approach_2_time = round(approach_2_timer_stop - approach_2_timer_start, 2)
# Approach - 3
approach_3_timer_start = time()
for (column_name, column_data) in df.iteritems():
df2[column_name + '_flag'] = column_data.between(left=lower_bound, right=upper_bound, inclusive=inclusive)
approach_3_timer_stop = time()
approach_3_time = round(approach_3_timer_stop - approach_3_timer_start, 2)
print(f""" Time taken by approach 1 : {approach_1_time} """)
print(f""" Time taken by approach 2 : {approach_2_time} """)
print(f""" Time taken by approach 3 : {approach_3_time} """)
def flag_values_not_in_range(
df: pd.DataFrame,
bound: list,
inclusive: bool = True
) -> pd.DataFrame:
"""
:param df: data frame to test if values in columns are within a range
:param bound: user-defined lower and upper bound of values to test in data frame columns
:param inclusive: include boundaries in filtering.
Defaults to True
:return: True if value in column does not fall within range.
False if value in column falls within range.
"""
# Get names of the columns into a list
columns = df.columns
# Make a copy to avoid mutation of the source dataframe
df1 = df.copy(deep=True)
# Loop through the columns and create new flags
for column in columns:
df1[column + '_range_flag'] = df1[column].between(
left=bound[0],
right=bound[1],
inclusive=inclusive
)
# Return the result
return df1
Thanks @UGuntupalli! Very comprehensive sketch. I'll give the issue a deeper look after work today!