featuretools
CumMean and CumSum can fail on all null columns
The CumMean and CumSum primitives can fail during calculate_feature_matrix if a numeric column containing all pd.NA values is present. The failure happens when Featuretools attempts to initialize Woodwork on the feature matrix.
import pandas as pd
import featuretools as ft

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA] * 3
})

es = ft.EntitySet('test')
es.add_dataframe(dataframe_name="test_df",
                 dataframe=df,
                 index='id',
                 logical_types={'null_ints': 'IntegerNullable'})

ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
57 try:
---> 58 series = series.astype(new_dtype)
59 except (TypeError, ValueError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5814 # else, only a single dtype is given
-> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5816 return self._constructor(new_data).__finalize__(self, method="astype")
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
419
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
591
--> 592 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
593
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
1308 try:
-> 1309 new_values = astype_array(values, dtype, copy=copy)
1310 except (ValueError, TypeError):
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
1256 else:
-> 1257 values = astype_nansafe(values, dtype, copy=copy)
1258
~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1200 # Explicit copy, or required since NumPy can't view from / to object.
-> 1201 return arr.astype(dtype, copy=True)
1202
TypeError: float() argument must be a string or a number, not 'NAType'
During handling of the above exception, another exception occurred:
TypeConversionError Traceback (most recent call last)
<ipython-input-15-8a9a157a6521> in <module>
----> 1 ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])
~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
38 ep.on_error(error=e,
39 runtime=runtime)
---> 40 raise e
41
42 # send return value
~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
30 # call function
31 start = time.time()
---> 32 return_value = func(*args, **kwargs)
33 runtime = time.time() - start
34 except Exception as e:
~/dev/featuretools/featuretools/synthesis/dfs.py in dfs(dataframes, relationships, entityset, target_dataframe_name, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_dataframes, ignore_columns, primitive_options, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_types, progress_callback, include_cutoff_time)
276 return features
277
--> 278 feature_matrix = calculate_feature_matrix(features,
279 entityset=entityset,
280 cutoff_time=cutoff_time,
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_feature_matrix(features, entityset, cutoff_time, instance_ids, dataframes, relationships, cutoff_time_in_index, training_window, approximate, save_progress, verbose, chunk_size, n_jobs, dask_kwargs, progress_callback, include_cutoff_time)
291 include_cutoff_time=include_cutoff_time)
292 else:
--> 293 feature_matrix = calculate_chunk(cutoff_time=cutoff_time_to_pass,
294 chunk_size=chunk_size,
295 feature_set=feature_set,
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_chunk(cutoff_time, chunk_size, feature_set, entityset, approximate, training_window, save_progress, no_unapproximated_aggs, cutoff_df_time_col, target_time, pass_columns, progress_bar, progress_callback, include_cutoff_time)
487
488 ww_init_kwargs = get_ww_types_from_features(feature_set.target_features, entityset, pass_columns, cutoff_time)
--> 489 feature_matrix = init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
490 return feature_matrix
491
~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
756 def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
757 for fm in feature_matrix:
--> 758 fm.ww.init(**ww_init_kwargs)
759
760 if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init(self, **kwargs)
95 Any errors resulting from skipping validation with invalid inputs may not be easily understood.
96 """
---> 97 self.init_with_partial_schema(**kwargs)
98
99 def init_with_full_schema(self, schema: TableSchema, validate: bool = True, **kwargs) -> None:
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init_with_partial_schema(self, schema, index, time_index, logical_types, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, validate, **kwargs)
202
203 # overwrite schema parameters with specified kwargs
--> 204 logical_types = _infer_missing_logical_types(self._dataframe, logical_types, existing_logical_types)
205 column_descriptions = {**existing_col_descriptions, **(column_descriptions or {})}
206 column_metadata = {**existing_col_metadata, **(column_metadata or {})}
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in _infer_missing_logical_types(dataframe, force_logical_types, existing_logical_types)
1037 logical_type = force_logical_types.get(name) if name in force_logical_types else existing_logical_types.get(name)
1038 parsed_logical_types[name] = _get_column_logical_type(series, logical_type, name)
-> 1039 updated_series = parsed_logical_types[name].transform(series)
1040 if updated_series is not series:
1041 dataframe[name] = updated_series
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
58 series = series.astype(new_dtype)
59 except (TypeError, ValueError):
---> 60 raise TypeConversionError(series, new_dtype, type(self))
61 return series
62
TypeConversionError: Error converting datatype for CUM_MEAN(null_ints) from type object to type float64. Please confirm the underlying data is consistent with logical type Double.
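For reference, the same error appears to be reproducible directly at the Woodwork level without going through DFS. This is a minimal sketch based on the traceback above, assuming the Double logical type maps to the float64 dtype as shown there:

import pandas as pd
import woodwork as ww

# An all-NA object column, similar to the CUM_MEAN(null_ints) feature produced above
fm = pd.DataFrame({'CUM_MEAN(null_ints)': pd.Series([pd.NA] * 3, dtype="object")})

# Initializing Woodwork with a Double logical type triggers the same object -> float64 cast
fm.ww.init(logical_types={'CUM_MEAN(null_ints)': 'Double'})
# TypeConversionError: Error converting datatype for CUM_MEAN(null_ints) from type object to type float64.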
@thehomebrewnerd moving this to Woodwork Integration Follow Up Epic
I think the fix for this would be to give the Double logical type the dtype of Float64, but that may cause issues as described here.
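As a quick illustration of that suggestion, here is a minimal sketch (assuming a pandas version that provides the nullable Float64 extension dtype) comparing the two dtypes on an all-NA object column:

import pandas as pd

s = pd.Series([pd.NA] * 3, dtype="object")

# Casting directly to the NumPy float64 dtype fails because float() cannot handle NAType:
# s.astype("float64")  # TypeError: float() argument must be a string or a number, not 'NAType'

# Casting to the nullable Float64 extension dtype succeeds and preserves the missing values as <NA>:
s.astype("Float64")
# 0    <NA>
# 1    <NA>
# 2    <NA>
# dtype: Float64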
There is a pandas issue:
import pandas as pd

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA] * 3
})

# Raises TypeError: float() argument must be a string or a number, not 'NAType'
df['null_ints'] = df['null_ints'].astype("object").astype("float64")
Opened a ticket with Pandas.
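As an aside, a small sketch (not from the original thread) suggesting the cast fails specifically on pd.NA values rather than on missing data in general; once the pd.NA values are replaced with np.nan, the same conversion goes through:

import pandas as pd
import numpy as np

s = pd.Series([pd.NA] * 3, dtype="object")

# Replacing pd.NA with np.nan first makes the object -> float64 cast succeed,
# so the failure appears specific to how astype handles NAType in object columns:
s.where(s.notna(), np.nan).astype("float64")
# 0   NaN
# 1   NaN
# 2   NaN
# dtype: float64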