siuba
siuba copied to clipboard
Bug: gather does not select anything if `*args` is unspecified
The docs for gather state for *args:
If unspecified, all columns are selected.
However, if no `*args` are specified, nothing is selected.
Code to reproduce this:
# %%
import pandas as pd
from siuba import gather

# %%
# Wide table: one identifier column plus three price columns.
prices = pd.DataFrame(
    {
        "id": [1, 2],
        "price_x": [0.1, 0.2],
        "price_y": [0.4, 0.5],
        "price_z": [0.7, 0.8],
    }
)
print(prices)
#    id  price_x  price_y  price_z
# 0   1      0.1      0.4      0.7
# 1   2      0.2      0.5      0.8

# %%
# No column selection is passed; per the docs every column should be gathered,
# but the result below comes back empty.
gathered = prices >> gather()
print(gathered)
# Empty DataFrame
# Columns: [id, price_x, price_y, price_z, key, value]
# Index: []
This would be the fix: var_list = var_create(*args or __data.columns).
And with some context and a doctest:
@singledispatch2(pd.DataFrame)
def gather(__data, key="key", value="value", *args, drop_na=False, convert=False):
    """Reshape table by gathering it in to long format.

    Parameters
    ----------
    __data:
        The input DataFrame.
    key:
        Name of the output column that holds the gathered column names
        (passed to ``pd.melt`` as ``var_name``).
    value:
        Name of the output column that holds the gathered values
        (passed to ``pd.melt`` as ``value_name``).
    *args:
        Column selection. If unspecified, all columns are selected.
        Columns that are not selected are kept as identifier columns.
    drop_na:
        If True, rows whose ``value`` entry is missing are removed and the
        index is reset.
    convert:
        Not yet implemented; any truthy value raises.

    Returns
    -------
    pd.DataFrame
        The long-format table produced by ``pd.melt``.

    Raises
    ------
    NotImplementedError
        If ``convert`` is truthy.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, gather
    >>> df = pd.DataFrame({"x": [1, 2], "y": [3, None]})
    >>> gather(df, key="key", value="value")
      key  value
    0   x    1.0
    1   x    2.0
    2   y    3.0
    3   y    NaN
    """
    # TODO: implement var selection over *args
    if convert:
        raise NotImplementedError("convert not yet implemented")

    # TODO: copied from nest and select
    # With no explicit selection, fall back to every column so that the
    # documented "all columns are selected" default actually holds.
    selection = args if args else tuple(__data.columns)
    var_list = var_create(*selection)
    od = var_select(__data.columns, *var_list)

    # pd.melt treats value_vars=None as "every column that is not an id_var".
    value_vars = list(od) or None
    id_vars = [col for col in __data.columns if col not in od]

    long = pd.melt(__data, id_vars, value_vars, key, value)

    if drop_na:
        return long[long[value].notna()].reset_index(drop=True)

    return long
However, there is a TODO in the code, so you may prefer to implement a more systematic solution.
Should be fixed in v0.4.0!