featuretools
CumMean and CumSum can fail on all null columns
The CumMean and CumSum primitives can fail during calculate_feature_matrix if a numeric column containing all pd.NA values is present. The failure happens when Featuretools attempts to initialize Woodwork on the feature matrix.
import pandas as pd
import featuretools as ft

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA] * 3
})

es = ft.EntitySet('test')
es.add_dataframe(dataframe_name="test_df",
                 dataframe=df,
                 index='id',
                 logical_types={'null_ints': 'IntegerNullable'})

ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
57 try:
---> 58 series = series.astype(new_dtype)
59 except (TypeError, ValueError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5814 # else, only a single dtype is given
-> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5816 return self._constructor(new_data).__finalize__(self, method="astype")
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
419
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
591
--> 592 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
593
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
1308 try:
-> 1309 new_values = astype_array(values, dtype, copy=copy)
1310 except (ValueError, TypeError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
1256 else:
-> 1257 values = astype_nansafe(values, dtype, copy=copy)
1258
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1200 # Explicit copy, or required since NumPy can't view from / to object.
-> 1201 return arr.astype(dtype, copy=True)
1202
TypeError: float() argument must be a string or a number, not 'NAType'
During handling of the above exception, another exception occurred:
TypeConversionError Traceback (most recent call last)
<ipython-input-15-8a9a157a6521> in <module>
----> 1 ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])
~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
38 ep.on_error(error=e,
39 runtime=runtime)
---> 40 raise e
41
42 # send return value
~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
30 # call function
31 start = time.time()
---> 32 return_value = func(*args, **kwargs)
33 runtime = time.time() - start
34 except Exception as e:
~/dev/featuretools/featuretools/synthesis/dfs.py in dfs(dataframes, relationships, entityset, target_dataframe_name, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_dataframes, ignore_columns, primitive_options, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_types, progress_callback, include_cutoff_time)
276 return features
277
--> 278 feature_matrix = calculate_feature_matrix(features,
279 entityset=entityset,
280 cutoff_time=cutoff_time,
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_feature_matrix(features, entityset, cutoff_time, instance_ids, dataframes, relationships, cutoff_time_in_index, training_window, approximate, save_progress, verbose, chunk_size, n_jobs, dask_kwargs, progress_callback, include_cutoff_time)
291 include_cutoff_time=include_cutoff_time)
292 else:
--> 293 feature_matrix = calculate_chunk(cutoff_time=cutoff_time_to_pass,
294 chunk_size=chunk_size,
295 feature_set=feature_set,
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_chunk(cutoff_time, chunk_size, feature_set, entityset, approximate, training_window, save_progress, no_unapproximated_aggs, cutoff_df_time_col, target_time, pass_columns, progress_bar, progress_callback, include_cutoff_time)
487
488 ww_init_kwargs = get_ww_types_from_features(feature_set.target_features, entityset, pass_columns, cutoff_time)
--> 489 feature_matrix = init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
490 return feature_matrix
491
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
756 def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
757 for fm in feature_matrix:
--> 758 fm.ww.init(**ww_init_kwargs)
759
760 if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init(self, **kwargs)
95 Any errors resulting from skipping validation with invalid inputs may not be easily understood.
96 """
---> 97 self.init_with_partial_schema(**kwargs)
98
99 def init_with_full_schema(self, schema: TableSchema, validate: bool = True, **kwargs) -> None:
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init_with_partial_schema(self, schema, index, time_index, logical_types, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, validate, **kwargs)
202
203 # overwrite schema parameters with specified kwargs
--> 204 logical_types = _infer_missing_logical_types(self._dataframe, logical_types, existing_logical_types)
205 column_descriptions = {**existing_col_descriptions, **(column_descriptions or {})}
206 column_metadata = {**existing_col_metadata, **(column_metadata or {})}
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in _infer_missing_logical_types(dataframe, force_logical_types, existing_logical_types)
1037 logical_type = force_logical_types.get(name) if name in force_logical_types else existing_logical_types.get(name)
1038 parsed_logical_types[name] = _get_column_logical_type(series, logical_type, name)
-> 1039 updated_series = parsed_logical_types[name].transform(series)
1040 if updated_series is not series:
1041 dataframe[name] = updated_series
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
58 series = series.astype(new_dtype)
59 except (TypeError, ValueError):
---> 60 raise TypeConversionError(series, new_dtype, type(self))
61 return series
62
TypeConversionError: Error converting datatype for CUM_MEAN(null_ints) from type object to type float64. Please confirm the underlying data is consistent with logical type Double.
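For reference, the same error appears to be reproducible directly at the Woodwork level without going through DFS. This is a minimal sketch based on the traceback above, assuming the Double logical type maps to the float64 dtype as shown there:

import pandas as pd
import woodwork as ww

# An all-NA object column, similar to the CUM_MEAN(null_ints) feature produced above
fm = pd.DataFrame({'CUM_MEAN(null_ints)': pd.Series([pd.NA] * 3, dtype="object")})

# Initializing Woodwork with a Double logical type triggers the same object -> float64 cast
fm.ww.init(logical_types={'CUM_MEAN(null_ints)': 'Double'})
# TypeConversionError: Error converting datatype for CUM_MEAN(null_ints) from type object to type float64.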
@thehomebrewnerd moving this to Woodwork Integration Follow Up Epic
I think the fix for this would be to give the Double logical type the dtype of Float64, but that may cause issues as described here.
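As a quick illustration of that suggestion, here is a minimal sketch (assuming a pandas version that provides the nullable Float64 extension dtype) comparing the two dtypes on an all-NA object column:

import pandas as pd

s = pd.Series([pd.NA] * 3, dtype="object")

# Casting directly to the NumPy float64 dtype fails because float() cannot handle NAType:
# s.astype("float64")  # TypeError: float() argument must be a string or a number, not 'NAType'

# Casting to the nullable Float64 extension dtype succeeds and preserves the missing values as <NA>:
s.astype("Float64")
# 0    <NA>
# 1    <NA>
# 2    <NA>
# dtype: Float64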
There is a pandas issue:
import pandas as pd

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA] * 3
})

# Raises TypeError: float() argument must be a string or a number, not 'NAType'
df['null_ints'] = df['null_ints'].astype("object").astype("float64")
Opened a ticket with Pandas.
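As an aside, a small sketch (not from the original thread) suggesting the cast fails specifically on pd.NA values rather than on missing data in general; once the pd.NA values are replaced with np.nan, the same conversion goes through:

import pandas as pd
import numpy as np

s = pd.Series([pd.NA] * 3, dtype="object")

# Replacing pd.NA with np.nan first makes the object -> float64 cast succeed,
# so the failure appears specific to how astype handles NAType in object columns:
s.where(s.notna(), np.nan).astype("float64")
# 0   NaN
# 1   NaN
# 2   NaN
# dtype: float64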