woodwork
woodwork copied to clipboard
Manipulating pandas dataframe doubles length of pandas column in Woodwork 0.16.0 (with string[arrow] dtype)
In the latest Woodwork release (0.16.0), manipulating the top-level pandas DataFrame after initializing Woodwork duplicates the column's data, so the column doubles in length. See the linked EvalML failure, specifically `test_simple_imputer_ignores_natural_language`.
import woodwork
import pandas as pd
import numpy as np
# Minimal reproduction: build two identical frames, initialize Woodwork on only
# one of them, apply the same row assignment to both, then compare one column.
cols = {
    "dates": pd.date_range("01-01-2022", periods=20),
    "categorical col": pd.Series(
        ["zero", "one", "two", "zero", "two"] * 4, dtype="category"
    ),
    "int col": [0, 1, 2, 0, 3] * 4,
    "object col": ["b", "b", "a", "c", "d"] * 4,
    "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4,
    "bool col": [True, False, False, True, True] * 4,
    "categorical with nan": pd.Series(
        [np.nan, "1", "0", "0", "3"] * 4, dtype="category"
    ),
    "int with nan": [np.nan, 1, 0, 0, 1] * 4,
    "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0] * 4,
    "object with nan": ["b", "b", np.nan, "c", np.nan] * 4,
    "bool col with nan": pd.Series(
        [True, np.nan, False, np.nan, True] * 4, dtype="category"
    ),
    "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan] * 4,
    "all nan cat": pd.Series(
        [np.nan, np.nan, np.nan, np.nan, np.nan] * 4, dtype="category"
    ),
    "natural language col": pd.Series(
        ["cats are really great", "don't", "believe", "me?", "well..."] * 4,
        dtype="string",
    ),
}

# `df` is the plain-pandas control; `df_1` is the same data with Woodwork
# initialized through the `ww` accessor.
df = pd.DataFrame(cols)
df_1 = pd.DataFrame(cols)
df_1.ww.init()

# Apply the identical mutation (blank out the last row) to both frames.
df.iloc[-1, :] = None
df_1.iloc[-1, :] = None

column = "natural language col"
expected = df[column]
actual = df_1[column]

# These checks fail with string[arrow] dtype in Woodwork 0.16.0: the
# Woodwork-initialized column comes back with twice as many rows.
# NOTE: `assert a != b` on two Series raises "truth value of a Series is
# ambiguous" instead of comparing them, so compare explicitly.
assert len(actual) == len(expected), (
    f"{column!r} has {len(actual)} rows after ww.init(), expected {len(expected)}"
)
# Cast both sides to a common dtype so only the values are compared, not the
# string storage backend (assumes Woodwork may have changed the dtype).
pd.testing.assert_series_equal(actual.astype("string"), expected.astype("string"))
Issue has been cancelled.
Pandas 1.5 should solve this bug!