mars
mars copied to clipboard
[BUG] `groupby` failed when using categorical columns with `as_index=False`
Describe the bug
`groupby` failed when using categorical columns with `as_index=False`.
To Reproduce
In [14]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})
In [15]: df = md.DataFrame(a)
In [16]: df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [16], in <cell line: 1>()
----> 1 df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()
File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:1308, in agg(groupby, func, method, combine_size, *args, **kwargs)
1298 use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na)
1299 agg_op = DataFrameGroupByAgg(
1300 raw_func=func,
1301 raw_func_kw=kwargs,
(...)
1306 use_inf_as_na=use_inf_as_na,
1307 )
-> 1308 return agg_op(groupby)
File ~/Documents/mars/mars/core/mode.py:77, in _EnterModeFuncWrapper.__call__.<locals>._inner(*args, **kwargs)
74 @functools.wraps(func)
75 def _inner(*args, **kwargs):
76 with enter_mode(**mode_name_to_value):
---> 77 return func(*args, **kwargs)
File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:297, in DataFrameGroupByAgg.__call__(self, groupby)
290 self.output_types = (
291 [OutputType.dataframe]
292 if groupby.op.output_types[0] == OutputType.dataframe_groupby
293 else [OutputType.series]
294 )
296 if self.output_types[0] == OutputType.dataframe:
--> 297 return self._call_dataframe(groupby, df)
298 else:
299 return self._call_series(groupby, df)
File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:224, in DataFrameGroupByAgg._call_dataframe(self, groupby, input_df)
223 def _call_dataframe(self, groupby, input_df):
--> 224 agg_df = build_mock_agg_result(
225 groupby, self.groupby_params, self.raw_func, **self.raw_func_kw
226 )
228 shape = (np.nan, agg_df.shape[1])
229 index_value = parse_index(agg_df.index, groupby.key, groupby.index_value.key)
File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:141, in build_mock_agg_result(groupby, groupby_params, raw_func, **raw_func_kw)
134 def build_mock_agg_result(
135 groupby: GROUPBY_TYPE,
136 groupby_params: Dict,
137 raw_func: Callable,
138 **raw_func_kw,
139 ):
140 try:
--> 141 agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw)
142 except ValueError:
143 if (
144 groupby_params.get("as_index") or _support_get_group_without_as_index
145 ): # pragma: no cover
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
921 result.columns = result.columns.droplevel(-1)
923 if not self.as_index:
--> 924 self._insert_inaxis_grouper_inplace(result)
925 result.index = Index(range(len(result)))
927 return result
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
1399 for name, lev, in_axis in zip(
1400 reversed(self.grouper.names),
1401 reversed(self.grouper.get_group_levels()),
(...)
1404 # GH #28549
1405 # When using .apply(-), name will be in columns already
1406 if in_axis and name not in columns:
-> 1407 result.insert(0, name, lev)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4441 if not isinstance(loc, int):
4442 raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
4445 self._mgr.insert(loc, column, value)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (1) does not match length of index (9)
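Expected behavior: the aggregation should complete without raising `ValueError` and return the per-group sums of `c`, with the grouping keys `a` and `b` as ordinary columns (since `as_index=False`).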
Additional context Add any other context about the problem here.
After digging into it, I found that this is a pandas issue; the same call fails on a plain pandas DataFrame:
In [20]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})
In [21]: a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [21], in <cell line: 1>()
----> 1 a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
921 result.columns = result.columns.droplevel(-1)
923 if not self.as_index:
--> 924 self._insert_inaxis_grouper_inplace(result)
925 result.index = Index(range(len(result)))
927 return result
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
1399 for name, lev, in_axis in zip(
1400 reversed(self.grouper.names),
1401 reversed(self.grouper.get_group_levels()),
(...)
1404 # GH #28549
1405 # When using .apply(-), name will be in columns already
1406 if in_axis and name not in columns:
-> 1407 result.insert(0, name, lev)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
4441 if not isinstance(loc, int):
4442 raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
4445 self._mgr.insert(loc, column, value)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (3) does not match length of index (9)
The upstream pandas issue https://github.com/pandas-dev/pandas/issues/46492 gives a workaround.