featuretools icon indicating copy to clipboard operation
featuretools copied to clipboard

Investigate `_dataframes_equal` for EntitySet equality

Open tamargrey opened this issue 3 years ago • 0 comments

Currently, `EntitySet.__eq__` uses Woodwork's equality check, `df.ww.__eq__(df2.ww)`, which only compares dataframe contents for pandas DataFrames and uses `df.equals(df2)` to do that comparison.

Before the Woodwork integration, the dataframes in Entities were compared with `_dataframes_equal`, which allowed for Dask, Koalas, and pandas dataframes and should be faster. However, it seems that the function was incorrect, so in order to use it confidently in the Woodwork-integrated EntitySet equality, some changes would have to be made, along with more testing.

To use it, we should first determine the following:

  • What are the limitations of using this instead of pandas' equality check? (Is it less accurate?)
  • Will it trigger compute operations for dask and/or koalas?
  • Is it actually faster than the pandas equality check?

The function as it was at the time the issue was created:

def _dataframes_equal(df1, df2):
    """Return whether ``df1`` and ``df2`` pass a column-by-column equality check.

    The check returns False when exactly one of the two frames has rows, or
    when a compared column set / column differs, and True otherwise.

    NOTE(review): known limitations visible in this code:
      * The ``*_empty`` flags are True for NON-empty frames, so the column
        comparison branch only runs when both frames have zero rows. Two
        non-empty frames fall straight through to ``return True``.
      * Column labels are compared as sets, so column order is ignored.
      * The row index and the column dtypes are never compared.
      * The NaN-handling mask marks every row whose null-ness matches as
        equal, which includes rows where both values are present but differ.

    Args:
        df1: First dataframe.
        df2: Second dataframe.

    Returns:
        bool: Result of the check described above.

    Raises:
        IndexError: Re-raised with a custom message if indexing into tuple
            column values fails.
    """
    # ^ means XOR
    df1_empty = bool(len(df1)) # this is True when df is not empty
    df2_empty = bool(len(df2))
    # Exactly one of the frames has rows -> not equal.
    if df1_empty ^ df2_empty:
        return False
    # NOTE(review): because both flags are True for non-empty frames, this
    # condition holds only when BOTH frames have zero rows. Presumably the
    # intent was "both frames have rows" -- confirm.
    elif not df1_empty and not df2_empty:
        # Set comparison: column order is not taken into account.
        if not set(df1.columns) == set(df2.columns):
            return False

        # Iterating a dataframe yields its column labels.
        for c in df1:
            df1c = df1[c]
            df2c = df2[c]
            # Object columns are cast before comparison.
            if df1c.dtype == object:
                df1c = df1c.astype('unicode')
            if df2c.dtype == object:
                df2c = df2c.astype('unicode')

            normal_compare = True
            # NOTE(review): this inspects df1c AFTER the cast above;
            # presumably the values are strings by now, in which case the
            # tuple test below cannot succeed -- confirm against pandas.
            if df1c.dtype == object:
                dropped = df1c.dropna()
                if not dropped.empty:
                    if isinstance(dropped.iloc[0], tuple):
                        # Taken from the uncast df2 column, unlike `dropped`.
                        dropped2 = df2[c].dropna()
                        normal_compare = False
                        # Compare tuple columns one element position at a time.
                        for i in range(len(dropped.iloc[0])):
                            try:
                                equal = dropped.apply(lambda x: x[i]).equals(
                                    dropped2.apply(lambda x: x[i]))
                            except IndexError:
                                raise IndexError("If column data are tuples, they must all be the same length")
                            if not equal:
                                return False
            if normal_compare:
                # handle nan equality correctly
                # This way is much faster than df1.equals(df2)
                result = df1c == df2c
                # NOTE(review): this mask is True wherever null-ness AGREES,
                # i.e. both null OR both non-null, so it also overwrites
                # genuine value mismatches with True. Only rows where exactly
                # one side is null can stay False. Restricting the mask to
                # `pd.isnull(df1c) & pd.isnull(df2c)` would limit it to the
                # both-null case.
                result[pd.isnull(df1c) == pd.isnull(df2c)] = True
                if not result.all():
                    return False
    return True

Behavior:

# Reproduction script: contrasts `_dataframes_equal` with `DataFrame.equals`
# on variations of one demo dataframe.
from featuretools.utils.wrangle import _dataframes_equal
# NOTE(review): `load_retail` is imported but not used in this snippet.
from featuretools.demo import load_flight, load_retail

es1 = load_flight()
# NOTE(review): `es2` is never read below.
es2 = load_flight()

df1 = es1['trip_logs']
# NOTE(review): this reads from `es1` again rather than `es2`; presumably
# `es2['trip_logs']` was intended -- confirm.
df2 = es1['trip_logs']

assert _dataframes_equal(df1, df2)
assert df1.equals(df2)

# Variations of df1, each changing one aspect (index, dtypes, values, rows, NaNs).
diff_idx_df = df1.reset_index()
# NOTE(review): 'taxi_out' appears twice in this dict literal. Python keeps the
# last value for a repeated key, so only the 'datetime64[ns]' cast is passed
# and the 'Int64' cast is dropped.
diff_dtypes_df = df1.astype({
                             'taxi_out':'Int64', # double --> int
                             'dep_delay':'string', # double --> string
                             'canceled':'int64',  # bool --> int
                             'taxi_out':'datetime64[ns]' # double --> datetime
                            }) 
diff_values_df = df1.replace({True:False})
empty_df = df1.loc[[], :]
partial_df = df1.iloc[4000:,:]
nans_df = df1.replace({False:None})


# Results reported for `_dataframes_equal`.
assert _dataframes_equal(df1, diff_dtypes_df)
assert not _dataframes_equal(df1, empty_df)
assert _dataframes_equal(df1, diff_idx_df)

# pandas' own equality check rejects every non-empty variation tested here.
assert not df1.equals(diff_idx_df)
assert not df1.equals(diff_dtypes_df)
assert not df1.equals(partial_df)
assert not df1.equals(diff_values_df)

# unexpected: frames with fewer rows, different values, or added NaNs are
# still reported as equal by `_dataframes_equal`.
assert _dataframes_equal(df1, partial_df)
assert _dataframes_equal(df1, diff_values_df)
assert _dataframes_equal(df1, nans_df)

Potential equality method:

    def __eq__(self, other, deep=False):
        """Return whether this object and ``other`` describe the same data.

        The comparison checks, in order: ``id``, ``time_type``, the number and
        names of the dataframes, each dataframe's ``ww.make_index`` setting
        and Woodwork schema, and finally the relationships.

        Args:
            other: The object to compare against. It must expose the same
                attributes as ``self`` (``id``, ``time_type``,
                ``dataframe_dict``, ``relationships``) and support lookup of
                a dataframe by name via ``other[name]``.
            deep (bool): Forwarded to the schema comparison. When True, the
                dataframe contents are additionally compared with
                ``_dataframes_equal``. Defaults to False.

        Returns:
            bool: True if every check passes, False at the first mismatch.
        """
        if self.id != other.id:
            return False
        if self.time_type != other.time_type:
            return False
        if len(self.dataframe_dict) != len(other.dataframe_dict):
            return False

        for df_name, df in self.dataframe_dict.items():
            if df_name not in other.dataframe_dict:
                return False
            other_df = other[df_name]
            # Compare against the OTHER dataframe's setting; comparing
            # `df.ww.make_index` with itself can never detect a mismatch.
            if df.ww.make_index != other_df.ww.make_index:
                return False
            if not df.ww._schema.__eq__(other_df.ww._schema, deep=deep):
                return False
            # Data comparison is potentially expensive, so only do it on request.
            if deep and not _dataframes_equal(df, other_df):
                return False

        # Equal length plus "every relationship of self is in other" is the
        # relationship check used here; order is not significant.
        if len(self.relationships) != len(other.relationships):
            return False
        return all(r in other.relationships for r in self.relationships)

tamargrey avatar Jun 10 '21 19:06 tamargrey