mars icon indicating copy to clipboard operation
mars copied to clipboard

[BUG] `groupby` failed when using categorical columns with `as_index=False`

Open hekaisheng opened this issue 2 years ago • 1 comment

Describe the bug: `groupby` fails when grouping by categorical columns with `as_index=False`.

To Reproduce

In [14]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})

In [15]: df = md.DataFrame(a)

In [16]: df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [16], in <cell line: 1>()
----> 1 df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:1308, in agg(groupby, func, method, combine_size, *args, **kwargs)
   1298 use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na)
   1299 agg_op = DataFrameGroupByAgg(
   1300     raw_func=func,
   1301     raw_func_kw=kwargs,
   (...)
   1306     use_inf_as_na=use_inf_as_na,
   1307 )
-> 1308 return agg_op(groupby)

File ~/Documents/mars/mars/core/mode.py:77, in _EnterModeFuncWrapper.__call__.<locals>._inner(*args, **kwargs)
     74 @functools.wraps(func)
     75 def _inner(*args, **kwargs):
     76     with enter_mode(**mode_name_to_value):
---> 77         return func(*args, **kwargs)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:297, in DataFrameGroupByAgg.__call__(self, groupby)
    290     self.output_types = (
    291         [OutputType.dataframe]
    292         if groupby.op.output_types[0] == OutputType.dataframe_groupby
    293         else [OutputType.series]
    294     )
    296 if self.output_types[0] == OutputType.dataframe:
--> 297     return self._call_dataframe(groupby, df)
    298 else:
    299     return self._call_series(groupby, df)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:224, in DataFrameGroupByAgg._call_dataframe(self, groupby, input_df)
    223 def _call_dataframe(self, groupby, input_df):
--> 224     agg_df = build_mock_agg_result(
    225         groupby, self.groupby_params, self.raw_func, **self.raw_func_kw
    226     )
    228     shape = (np.nan, agg_df.shape[1])
    229     index_value = parse_index(agg_df.index, groupby.key, groupby.index_value.key)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:141, in build_mock_agg_result(groupby, groupby_params, raw_func, **raw_func_kw)
    134 def build_mock_agg_result(
    135     groupby: GROUPBY_TYPE,
    136     groupby_params: Dict,
    137     raw_func: Callable,
    138     **raw_func_kw,
    139 ):
    140     try:
--> 141         agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw)
    142     except ValueError:
    143         if (
    144             groupby_params.get("as_index") or _support_get_group_without_as_index
    145         ):  # pragma: no cover

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    921                 result.columns = result.columns.droplevel(-1)
    923 if not self.as_index:
--> 924     self._insert_inaxis_grouper_inplace(result)
    925     result.index = Index(range(len(result)))
    927 return result

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
   1399 for name, lev, in_axis in zip(
   1400     reversed(self.grouper.names),
   1401     reversed(self.grouper.get_group_levels()),
   (...)
   1404     # GH #28549
   1405     # When using .apply(-), name will be in columns already
   1406     if in_axis and name not in columns:
-> 1407         result.insert(0, name, lev)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4441 if not isinstance(loc, int):
   4442     raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
   4445 self._mgr.insert(loc, column, value)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
   4532     return _reindex_for_setitem(value, self.index)
   4534 if is_list_like(value):
-> 4535     com.require_length_match(value, self.index)
   4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
    553 """
    554 Check the length of data matches the length of the index.
    555 """
    556 if len(data) != len(index):
--> 557     raise ValueError(
    558         "Length of values "
    559         f"({len(data)}) "
    560         "does not match length of index "
    561         f"({len(index)})"
    562     )

ValueError: Length of values (1) does not match length of index (9)

Expected behavior A clear and concise description of what you expected to happen.

Additional context Add any other context about the problem here.

hekaisheng avatar May 13 '22 15:05 hekaisheng

After digging into it, I found that this is a pandas issue; the same call fails in plain pandas, without Mars:

In [20]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})

In [21]: a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [21], in <cell line: 1>()
----> 1 a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    921                 result.columns = result.columns.droplevel(-1)
    923 if not self.as_index:
--> 924     self._insert_inaxis_grouper_inplace(result)
    925     result.index = Index(range(len(result)))
    927 return result

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
   1399 for name, lev, in_axis in zip(
   1400     reversed(self.grouper.names),
   1401     reversed(self.grouper.get_group_levels()),
   (...)
   1404     # GH #28549
   1405     # When using .apply(-), name will be in columns already
   1406     if in_axis and name not in columns:
-> 1407         result.insert(0, name, lev)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4441 if not isinstance(loc, int):
   4442     raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
   4445 self._mgr.insert(loc, column, value)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
   4532     return _reindex_for_setitem(value, self.index)
   4534 if is_list_like(value):
-> 4535     com.require_length_match(value, self.index)
   4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
    553 """
    554 Check the length of data matches the length of the index.
    555 """
    556 if len(data) != len(index):
--> 557     raise ValueError(
    558         "Length of values "
    559         f"({len(data)}) "
    560         "does not match length of index "
    561         f"({len(index)})"
    562     )

ValueError: Length of values (3) does not match length of index (9)

The upstream report, https://github.com/pandas-dev/pandas/issues/46492, gives a workaround.

hekaisheng avatar May 13 '22 16:05 hekaisheng