featuretools
featuretools copied to clipboard
Investigate `_dataframes_equal` for EntitySet equality
Currently EntitySet.__eq__
uses Woodwork's equality check df.ww.__eq__(df2.ww)
which only looks at dataframe equality for pandas DataFrames and uses df.equals(df2)
to do that equality check.
Before Woodwork integration, the dataframes in Entities were compared with _dataframes_equal
, which allowed for dask, koalas, and pandas and should be faster. However it seems that the function was incorrect, and in order to confidently use it in the woodwork-integrated EntitySet equality, some changes would have to be made along with more testing.
To use, we should determine the following:
- What are the limitations of using this instead of pandas' equality check? (is it less accurate)
- Will it trigger compute operations for dask and/or koalas?
- Is it actually faster than the pandas equality check?
The function as it was at the time the issue was created:
def _dataframes_equal(df1, df2):
# ^ means XOR
df1_empty = bool(len(df1)) # this is True when df is not empty
df2_empty = bool(len(df2))
if df1_empty ^ df2_empty:
return False
elif not df1_empty and not df2_empty:
if not set(df1.columns) == set(df2.columns):
return False
for c in df1:
df1c = df1[c]
df2c = df2[c]
if df1c.dtype == object:
df1c = df1c.astype('unicode')
if df2c.dtype == object:
df2c = df2c.astype('unicode')
normal_compare = True
if df1c.dtype == object:
dropped = df1c.dropna()
if not dropped.empty:
if isinstance(dropped.iloc[0], tuple):
dropped2 = df2[c].dropna()
normal_compare = False
for i in range(len(dropped.iloc[0])):
try:
equal = dropped.apply(lambda x: x[i]).equals(
dropped2.apply(lambda x: x[i]))
except IndexError:
raise IndexError("If column data are tuples, they must all be the same length")
if not equal:
return False
if normal_compare:
# handle nan equality correctly
# This way is much faster than df1.equals(df2)
result = df1c == df2c
result[pd.isnull(df1c) == pd.isnull(df2c)] = True
if not result.all():
return False
return True
Behavior:
from featuretools.utils.wrangle import _dataframes_equal
from featuretools.demo import load_flight, load_retail
es1 = load_flight()
es2 = load_flight()
df1 = es1['trip_logs']
df2 = es1['trip_logs']
assert _dataframes_equal(df1, df2)
assert df1.equals(df2)
diff_idx_df = df1.reset_index()
diff_dtypes_df = df1.astype({
'taxi_out':'Int64', # double --> int
'dep_delay':'string', # double --> string
'canceled':'int64', # bool --> int
'taxi_out':'datetime64[ns]' # double --> datetime
})
diff_values_df = df1.replace({True:False})
empty_df = df1.loc[[], :]
partial_df = df1.iloc[4000:,:]
nans_df = df1.replace({False:None})
assert _dataframes_equal(df1, diff_dtypes_df)
assert not _dataframes_equal(df1, empty_df)
assert _dataframes_equal(df1, diff_idx_df)
assert not df1.equals(diff_idx_df)
assert not df1.equals(diff_dtypes_df)
assert not df1.equals(partial_df)
assert not df1.equals(diff_values_df)
#unexpected
assert _dataframes_equal(df1, partial_df)
assert _dataframes_equal(df1, diff_values_df)
assert _dataframes_equal(df1, nans_df)
potential equality method:
def __eq__(self, other, deep=False):
if self.id != other.id:
return False
if self.time_type != other.time_type:
return False
if len(self.dataframe_dict) != len(other.dataframe_dict):
return False
for df_name, df in self.dataframe_dict.items():
if df_name not in other.dataframe_dict:
return False
if df.ww.make_index != df.ww.make_index:
return False
if not df.ww._schema.__eq__(other[df_name].ww._schema, deep=deep):
return False
if deep and not _dataframes_equal(df, other[df_name]):
return False
if not len(self.relationships) == len(other.relationships):
return False
for r in self.relationships:
if r not in other.relationships:
return False
return True