pandas-log icon indicating copy to clipboard operation
pandas-log copied to clipboard

TypeError: data type not understood

Open mattharrison opened this issue 3 years ago • 2 comments

Brief Description

I'm trying to run pandas-log on my chain and it fails with the error:

TypeError: data type not understood

System Information

  • Python version (required): Python 3.8.5
  • Pandas version: 1.3.2

Minimally Reproducible Code

import pandas as pd
# Load the zipped vehicles CSV directly from GitHub; `autos` is the raw frame
# that tweak_autos() below both receives and reads its source columns from.
autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')
def to_tz(df_, time_col, tz_offset, tz_name):
    """Parse a column of date strings group-by-group and convert it to one zone.

    Args:
        df_: Frame holding both the date-string column and the zone column.
        time_col: Name of the column with the date strings to parse.
        tz_offset: Name of the column to group by (called with 'tz' below).
        tz_name: Time zone passed to ``tz_convert`` for the final result
            (called with 'US/Eastern' below).

    Returns:
        The result of the grouped ``transform`` on ``time_col``.
    """
    return (df_
             .groupby(tz_offset)
             [time_col]
             # Each group is parsed, localized to the zone given by `s.name`
             # (presumably the group key, i.e. that group's tz_offset value --
             # verify against pandas groupby/transform), then converted to
             # tz_name.
             # NOTE(review): ambiguous=True presumably resolves ambiguous
             # wall-clock times as DST -- confirm in the tz_localize docs.
             .transform(lambda s: pd.to_datetime(s)
                 .dt.tz_localize(s.name, ambiguous=True)
                 .dt.tz_convert(tz_name))
            )


def tweak_autos(autos):
    """Select a subset of vehicle columns, derive new ones, and downcast dtypes.

    The chain is: select ``cols`` -> ``assign`` new/overwritten columns ->
    ``pipe`` through ``show`` -> ``astype`` -> ``drop`` the helper source
    columns 'trany' and 'eng_dscr'.

    Every ``assign`` expression except the ``createdOn`` lambda is computed
    from the ``autos`` argument itself, not from the intermediate frame.

    Args:
        autos: The raw vehicles frame.

    Returns:
        The result of the method chain described above.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
        'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             # Categorical column; its dtype appears to be the one named in
             # the second TypeError in this report.
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # Digit captured from `trany`; rows with no match are filled
             # with '20' before the int8 cast.
             speeds=autos.trany.str.extract(r'(\d)+').fillna('20').astype('int8'),
             # Three capital letters following an HH:MM time in createdOn,
             # with 'EDT' rewritten to 'EST5EDT'.
             tz=autos.createdOn.str.extract(r'\d\d:\d\d ([A-Z]{3}?)').replace('EDT', 'EST5EDT'),
             # Characters 4-18 of createdOn plus its last four characters
             # (presumably the date/time part and the year -- TODO confirm).
             str_date=(autos.createdOn.str.slice(4,19) + ' ' + autos.createdOn.str.slice(-4)),
             # Callable form: to_tz needs the 'str_date' and 'tz' columns
             # assigned just above. The tz-aware result appears to be the
             # dtype named in the first TypeError in this report.
             createdOn=lambda df_: to_tz(df_, 'str_date', 'tz', 'US/Eastern'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     # NOTE(review): `show` is not defined anywhere in this snippet; both
     # reported runs fail inside .assign() before this step matters.
     .pipe(show, rows=2, title='New Cols')            
     .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16', 'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )
import pandas_log
# Run the chain inside pandas-log's enable() context manager; this is the
# call the tracebacks below point at when the TypeError is raised.
with pandas_log.enable():
    tweak_autos(autos)

Error Messages

1) fillna(value: 'object | ArrayLike | None' ="20", method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None):
	Metadata:
	* Filled 837 with 20.
	Execution Stats:
	* Execution time: Step Took 0.001512 seconds.

1) replace(to_replace="EDT", value="EST5EDT", inplace: 'bool' = False, limit=None, regex: 'bool' = False, method: 'str' = 'pad'):
	Execution Stats:
	* Execution time: Step Took 0.001215 seconds.

1) groupby(by="tz", axis: 'Axis' = 0, level: 'Level | None' = None, as_index: 'bool' = True, sort: 'bool' = True, group_keys: 'bool' = True, squeeze: 'bool | lib.NoDefault' = <no_default>, observed: 'bool' = False, dropna: 'bool' = True):
	Metadata:
	* Grouping by tz resulted in 2 groups like 
		EST,
		EST5EDT,
	  and more.
	Execution Stats:
	* Execution time: Step Took 0.006409 seconds.
/home/matt/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py:249: UserWarning: Some pandas logging may involve copying dataframes, which can be time-/memory-intensive. Consider passing copy_ok=False to the enable/auto_enable functions in pandas_log if issues arise.
  warnings.warn(COPY_WARNING_MSG)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-f6bfc55c635b> in <module>
     33 import pandas_log
     34 with pandas_log.enable():
---> 35     tweak_autos(autos)

<ipython-input-1-f6bfc55c635b> in tweak_autos(autos)
     14     cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
     15         'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
---> 16     return (autos
     17      [cols]
     18      .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),

~/envs/menv/lib/python3.8/site-packages/pandas_flavor/register.py in __call__(self, *args, **kwargs)
     27             @wraps(method)
     28             def __call__(self, *args, **kwargs):
---> 29                 return method(self._obj, *args, **kwargs)
     30 
     31         register_dataframe_accessor(method.__name__)(AccessorMethod)

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in wrapped(*args, **fn_kwargs)
    184 
    185             input_df, fn_args = args[0], args[1:]
--> 186             output_df = _run_method_and_calc_stats(
    187                 fn,
    188                 fn_args,

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in _run_method_and_calc_stats(fn, fn_args, fn_kwargs, input_df, full_signature, silent, verbose, copy_ok, calculate_memory)
    168             output_df,
    169         )
--> 170         step_stats.log_stats_if_needed(silent, verbose, copy_ok)
    171         if isinstance(output_df, pd.DataFrame) or isinstance(output_df, pd.Series):
    172             step_stats.persist_execution_stats()

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in log_stats_if_needed(self, silent, verbose, copy_ok)
    106 
    107         if verbose or self.fn.__name__ not in DATAFRAME_ADDITIONAL_METHODS_TO_OVERIDE:
--> 108             s = self.__repr__(verbose, copy_ok)
    109             if s:
    110                 # If this method isn't patched and verbose is False, __repr__ will give an empty string, which

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in __repr__(self, verbose, copy_ok)
    147 
    148         # Step Metadata stats
--> 149         logs, tips = self.get_logs_for_specifc_method(verbose, copy_ok)
    150         metadata_stats = f"\033[4mMetadata\033[0m:\n{logs}" if logs else ""
    151         metadata_tips = f"\033[4mTips\033[0m:\n{tips}" if tips else ""

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in get_logs_for_specifc_method(self, verbose, copy_ok)
    128 
    129         log_method = partial(log_method, self.output_df, self.input_df)
--> 130         logs, tips = log_method(*self.fn_args, **self.fn_kwargs)
    131         return logs, tips
    132 

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in log_assign(output_df, input_df, **kwargs)
    250             # If copying is ok, we can check how many values actually changed
    251             for col in changed_cols:
--> 252                 values_changed, values_unchanged = num_values_changed(
    253                     input_df[col], output_df[col]
    254                 )

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in num_values_changed(input_obj, output_obj)
    127         isinstance(input_obj, pd.Series)
    128         and isinstance(output_obj, pd.Series)
--> 129         and input_obj.dtype != output_obj.dtype
    130     ):
    131         # Comparing values for equality across dtypes wouldn't be well-defined so we just say they all changed

TypeError: Cannot interpret 'datetime64[ns, US/Eastern]' as a data type

mattharrison avatar Sep 03 '21 19:09 mattharrison

Note that I commented out the line:

createdOn=lambda df_: to_tz(df_, 'str_date', 'tz', 'US/Eastern'),

I then re-ran the code and got the same kind of error, this time for the categorical column:

/home/matt/envs/menv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3418: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)

1) fillna(value: 'object | ArrayLike | None' ="20", method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit=None, downcast=None):
	Metadata:
	* Filled 837 with 20.
	Execution Stats:
	* Execution time: Step Took 0.001567 seconds.

1) replace(to_replace="EDT", value="EST5EDT", inplace: 'bool' = False, limit=None, regex: 'bool' = False, method: 'str' = 'pad'):
	Execution Stats:
	* Execution time: Step Took 0.003579 seconds.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-376a7e1a7d6b> in <module>
     33 import pandas_log
     34 with pandas_log.enable():
---> 35     tweak_autos(autos)

<ipython-input-4-376a7e1a7d6b> in tweak_autos(autos)
     14     cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', 
     15         'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
---> 16     return (autos
     17      [cols]
     18      .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),

~/envs/menv/lib/python3.8/site-packages/pandas_flavor/register.py in __call__(self, *args, **kwargs)
     27             @wraps(method)
     28             def __call__(self, *args, **kwargs):
---> 29                 return method(self._obj, *args, **kwargs)
     30 
     31         register_dataframe_accessor(method.__name__)(AccessorMethod)

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in wrapped(*args, **fn_kwargs)
    184 
    185             input_df, fn_args = args[0], args[1:]
--> 186             output_df = _run_method_and_calc_stats(
    187                 fn,
    188                 fn_args,

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_log.py in _run_method_and_calc_stats(fn, fn_args, fn_kwargs, input_df, full_signature, silent, verbose, copy_ok, calculate_memory)
    168             output_df,
    169         )
--> 170         step_stats.log_stats_if_needed(silent, verbose, copy_ok)
    171         if isinstance(output_df, pd.DataFrame) or isinstance(output_df, pd.Series):
    172             step_stats.persist_execution_stats()

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in log_stats_if_needed(self, silent, verbose, copy_ok)
    106 
    107         if verbose or self.fn.__name__ not in DATAFRAME_ADDITIONAL_METHODS_TO_OVERIDE:
--> 108             s = self.__repr__(verbose, copy_ok)
    109             if s:
    110                 # If this method isn't patched and verbose is False, __repr__ will give an empty string, which

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in __repr__(self, verbose, copy_ok)
    147 
    148         # Step Metadata stats
--> 149         logs, tips = self.get_logs_for_specifc_method(verbose, copy_ok)
    150         metadata_stats = f"\033[4mMetadata\033[0m:\n{logs}" if logs else ""
    151         metadata_tips = f"\033[4mTips\033[0m:\n{tips}" if tips else ""

~/envs/menv/lib/python3.8/site-packages/pandas_log/pandas_execution_stats.py in get_logs_for_specifc_method(self, verbose, copy_ok)
    128 
    129         log_method = partial(log_method, self.output_df, self.input_df)
--> 130         logs, tips = log_method(*self.fn_args, **self.fn_kwargs)
    131         return logs, tips
    132 

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in log_assign(output_df, input_df, **kwargs)
    250             # If copying is ok, we can check how many values actually changed
    251             for col in changed_cols:
--> 252                 values_changed, values_unchanged = num_values_changed(
    253                     input_df[col], output_df[col]
    254                 )

~/envs/menv/lib/python3.8/site-packages/pandas_log/patched_logs_functions.py in num_values_changed(input_obj, output_obj)
    127         isinstance(input_obj, pd.Series)
    128         and isinstance(output_obj, pd.Series)
--> 129         and input_obj.dtype != output_obj.dtype
    130     ):
    131         # Comparing values for equality across dtypes wouldn't be well-defined so we just say they all changed

TypeError: Cannot interpret 'CategoricalDtype(categories=['2-Wheel Drive', '4-Wheel Drive',
                  '4-Wheel or All-Wheel Drive', 'All-Wheel Drive',
                  'Front-Wheel Drive', 'Other', 'Part-time 4-Wheel Drive',
                  'Rear-Wheel Drive'],
, ordered=False)' as a data type

mattharrison avatar Sep 03 '21 19:09 mattharrison

Also note that these failures were not handled correctly by the context manager: after the exception, pandas-log is apparently still active outside the `with` block, so if I then try to run tweak_autos normally, it still tries to use pandas-log and fails. Maybe this warrants its own bug.

mattharrison avatar Sep 03 '21 19:09 mattharrison