NVTabular
NVTabular copied to clipboard
[BUG] partition_on throws an error if not a list.
Describe the bug: Small bug, but when saving a parquet file with partition_on, I believe the code looks at index 0 of the argument regardless of its type, so passing a string such as partition_on = 'partition_index' raises KeyError: 'p'. A simple workaround is to pass a list instead: partition_on = ['partition_index'].
Steps/Code to reproduce bug
import random
import pandas as pd
import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags
from merlin.io.dataset import Dataset
# Create dummy data and a workflow.
columns = ['item_id', 'user_id', 'itemfeat', 'userfeat']
nrows = 4
# One column of random integers per name. Iterating over `columns` directly
# (instead of indexing it with range(nrows)) keeps the number of columns
# independent of the number of rows.
df = pd.DataFrame(
    {col: [random.randint(0, 10) for _ in range(nrows)] for col in columns},
    index=range(nrows),
)
# First half of the rows is tagged 0, the remainder 1; using
# `nrows - nrows // 2` keeps the list length equal to nrows when nrows is odd.
df['partition_index'] = [0] * (nrows // 2) + [1] * (nrows - nrows // 2)
# df['target'] = 1
# targets = ["target"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION,Tags.TARGET])
user_id = ['user_id'] >> AddMetadata(tags=[Tags.USER_ID]) >> Categorify()
item_id = ['item_id'] >> AddMetadata(tags=[Tags.ITEM_ID]) >> Categorify()
item_features = ['itemfeat'] >> AddMetadata(tags=[Tags.ITEM]) >> Categorify()
user_features = ['userfeat'] >> AddMetadata(tags=[Tags.USER]) >> Categorify()
part_idx = ['partition_index'] >> AddMetadata(tags=[Tags.CONTEXT])
outputs = user_id + item_id + item_features + user_features + part_idx
workflow = nvt.Workflow(outputs)
df = Dataset(df)
workflow.fit(df)
# Save the data and trigger the error. `partition_on` is deliberately passed as
# a plain string here: per this report that is what raises KeyError: 'p'.
# Passing ['partition_index'] (a list) is the reported workaround.
df = workflow.transform(df).to_parquet(
    output_path='delete_me.parquet',
    shuffle=False,
    partition_on='partition_index',
    # preserve_files = True
)
@angmc would you mind checking whether this is still a bug? Thanks!