SDV
SDV copied to clipboard
Small relational dataset example taking too long
- SDV version: 0.4.5
- Python version: 3.8
- Operating System: Ubuntu 20.04 (Docker container)
I'm trying to create an HMA1 model with the Pubs dataset (https://relational.fit.cvut.cz/dataset/Pubs), but the fit method is taking too long (over 30 min). I limited the tables to a maximum of 100 rows, and the number of columns varies from 3 to 10. I saved the metadata in JSON format to make it easier to reproduce.
Here is the metadata.json:
{
"tables": {
"authors": {
"fields": {
"phone": {
"type": "categorical",
"pii": "True",
"pii_category": "phone_number"
},
"zip": {
"type": "numerical",
"subtype": "integer"
},
"city": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"au_id": {
"type": "id",
"subtype": "string"
},
"contract": {
"type": "numerical",
"subtype": "integer"
},
"au_lname": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"au_fname": {
"type": "categorical",
"pii": "True",
"pii_category": "last_name"
},
"address": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"state": {
"type": "categorical"
}
},
"primary_key": "au_id"
},
"discounts": {
"fields": {
"discounttype": {
"type": "categorical"
},
"highqty": {
"type": "numerical",
"subtype": "float"
},
"discount": {
"type": "numerical",
"subtype": "float"
},
"lowqty": {
"type": "numerical",
"subtype": "float"
},
"stor_id": {
"type": "numerical",
"subtype": "float"
}
}
},
"employee": {
"fields": {
"job_lvl": {
"type": "numerical",
"subtype": "integer"
},
"fname": {
"type": "categorical",
"pii": "True",
"pii_category": "first_name"
},
"hire_date": {
"type": "categorical",
"pii": "True",
"pii_category": "date"
},
"lname": {
"type": "categorical",
"pii": "True",
"pii_category": "last_name"
},
"minit": {
"type": "categorical"
},
"pub_id": {
"type": "id",
"subtype": "integer",
"ref": {
"table": "publishers",
"field": "pub_id"
}
},
"job_id": {
"type": "id",
"subtype": "integer",
"ref": {
"table": "jobs",
"field": "job_id"
}
},
"emp_id": {
"type": "categorical"
}
}
},
"jobs": {
"fields": {
"job_id": {
"type": "id",
"subtype": "integer"
},
"min_lvl": {
"type": "numerical",
"subtype": "integer"
},
"job_desc": {
"type": "categorical"
},
"max_lvl": {
"type": "categorical",
"pii": "True",
"pii_category": "first_name"
}
},
"primary_key": "job_id"
},
"pub_info": {
"fields": {
"logo": {
"type": "categorical"
},
"pub_id": {
"type": "id",
"subtype": "integer",
"ref": {
"table": "publishers",
"field": "pub_id"
}
},
"pr_info": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
}
}
},
"publishers": {
"fields": {
"pub_name": {
"type": "categorical",
"pii": "True",
"pii_category": "first_name"
},
"city": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"pub_id": {
"type": "id",
"subtype": "integer"
},
"country": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"state": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
}
},
"primary_key": "pub_id"
},
"roysched": {
"fields": {
"hirange": {
"type": "numerical",
"subtype": "integer"
},
"royalty": {
"type": "numerical",
"subtype": "integer"
},
"lorange": {
"type": "numerical",
"subtype": "integer"
},
"title_id": {
"type": "id",
"subtype": "string",
"ref": {
"table": "titles",
"field": "title_id"
}
}
}
},
"sales": {
"fields": {
"ord_date": {
"type": "categorical",
"pii": "True",
"pii_category": "date"
},
"payterms": {
"type": "categorical",
"pii": "True",
"pii_category": "last_name"
},
"ord_num": {
"type": "categorical",
"pii": "True",
"pii_category": "first_name"
},
"title_id": {
"type": "id",
"subtype": "string",
"ref": {
"table": "titles",
"field": "title_id"
}
},
"stor_id": {
"type": "id",
"subtype": "integer",
"ref": {
"table": "stores",
"field": "stor_id"
}
},
"qty": {
"type": "numerical",
"subtype": "integer"
}
}
},
"stores": {
"fields": {
"zip": {
"type": "numerical",
"subtype": "integer"
},
"city": {
"type": "categorical",
"pii": "True",
"pii_category": "address"
},
"stor_address": {
"type": "categorical",
"pii": "True",
"pii_category": "last_name"
},
"stor_name": {
"type": "categorical",
"pii": "True",
"pii_category": "first_name"
},
"stor_id": {
"type": "id",
"subtype": "integer"
},
"state": {
"type": "categorical"
}
},
"primary_key": "stor_id"
},
"titleauthor": {
"fields": {
"au_id": {
"type": "id",
"subtype": "string",
"ref": {
"table": "authors",
"field": "au_id"
}
},
"au_ord": {
"type": "numerical",
"subtype": "integer"
},
"royaltyper": {
"type": "numerical",
"subtype": "integer"
},
"title_id": {
"type": "id",
"subtype": "string",
"ref": {
"table": "titles",
"field": "title_id"
}
}
}
},
"titles": {
"fields": {
"price": {
"type": "numerical",
"subtype": "float"
},
"advance": {
"type": "categorical",
"pii": "True",
"pii_category": "city"
},
"ytd_sales": {
"type": "numerical",
"subtype": "float"
},
"pub_id": {
"type": "id",
"subtype": "integer",
"ref": {
"table": "publishers",
"field": "pub_id"
}
},
"title_id": {
"type": "id",
"subtype": "string"
},
"notes": {
"type": "categorical"
},
"pubdate": {
"type": "categorical",
"pii": "True",
"pii_category": "date"
},
"royalty": {
"type": "numerical",
"subtype": "float"
},
"type": {
"type": "categorical"
},
"title": {
"type": "categorical"
}
},
"primary_key": "title_id"
}
}
}
Here's the code I'm using:
import time
import json
import glob
from pathlib import Path
import pandas as pd
import sdv
from sdv import Metadata
from sdv.relational import HMA1
ROWS_LIM = 10  # Maximum number of rows kept from each CSV table.


def load_tables(folder):
    """Load every CSV table in ``folder`` together with its metadata JSON.

    Each ``<name>.csv`` found directly inside ``folder`` becomes one entry of
    the returned dict, keyed by ``<name>`` and truncated to its first
    ``ROWS_LIM`` rows. The metadata is read from
    ``<folder>/metadata/metadata.json``.

    Args:
        folder: Directory holding the CSV files (str or path-like). A
            trailing slash is accepted but no longer required.

    Returns:
        A ``(table_dict, metadata_json)`` tuple: a dict mapping table name to
        ``pandas.DataFrame``, and the parsed metadata JSON.

    Raises:
        FileNotFoundError: If the metadata file does not exist.
    """
    folder = Path(folder)
    table_dict = {}
    # glob returns matches in arbitrary order; sort so tables are loaded in a
    # reproducible order from run to run.
    for path in sorted(glob.glob(str(folder / "*.csv"))):
        # ``stem`` drops only the final suffix, whereas ``str.replace`` would
        # also strip a ".csv" occurring in the middle of the file name.
        table_name = Path(path).stem
        table_dict[table_name] = pd.read_csv(path).head(ROWS_LIM)
    # The context manager closes the handle that a bare ``open`` would leak.
    metadata_path = folder / "metadata" / "metadata.json"
    with open(metadata_path, "r", encoding="utf-8") as metadata_file:
        metadata_json = json.load(metadata_file)
    return table_dict, metadata_json
def multi_table_gen(folder, model='HMA1', sample_size=5):
    """Fit an HMA1 model on the tables found in ``folder`` and sample from it.

    Prints the shape of every loaded table and the duration of each stage
    (model creation, fitting, sampling and line-break replacement).

    Args:
        folder: Directory passed on to ``load_tables``.
        model: Currently unused; an ``HMA1`` model is always built.
        sample_size: Currently unused; ``sample()`` is called with its
            default arguments.

    Returns:
        The object returned by ``HMA1.sample()``, with line breaks replaced
        by spaces in each of its entries. It is presumably a dict mapping
        table name to DataFrame; confirm against the SDV version in use.
    """
    tables, metadata_json = load_tables(folder)
    for table in tables.values():
        print(table.shape)
    metadata = Metadata(metadata_json)

    # perf_counter is monotonic, so the reported durations cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    print("Creating model")
    start_time = time.perf_counter()
    # Use a separate local name so the ``model`` argument is not overwritten.
    hma_model = HMA1(metadata)
    print(f"Model created. Duration: {time.perf_counter() - start_time}")

    print("Fitting model")
    start_time = time.perf_counter()
    hma_model.fit(tables)
    print(f"Model fitted. Duration: {time.perf_counter() - start_time}")

    print("Getting sample")
    start_time = time.perf_counter()
    sample = hma_model.sample()
    print(f"Sample generated. Duration: {time.perf_counter() - start_time}")

    print("Replace line break")
    start_time = time.perf_counter()
    for key in sample:
        # Replace embedded line breaks with spaces in every sampled table.
        sample[key] = sample[key].replace({'\n': ' '}, regex=True)
    print(f"Line break finished. Duration: {time.perf_counter() - start_time}")
    return sample
# Driver: generate synthetic tables for the Pubs sample and write one CSV per
# table into <folder>/output/.
folder = 'SDV_datasets/pubs_sample/'
sample = multi_table_gen(folder)
# to_csv does not create missing directories, so make sure the target exists.
output_dir = Path(folder) / 'output'
output_dir.mkdir(parents=True, exist_ok=True)
for key in sample:
    sample[key].to_csv(output_dir / f'{key}_synthetic.csv')
I even tried a smaller example, with up to 10 rows for each table, but it is also taking a lot of time. What can I change to make this run faster?
Thank you for your attention
Update: when running the smaller example (with up to 10 rows), I got the following:
MemoryError: Unable to allocate 14.7 GiB for an array with shape (44413, 44413) and data type float64
These are the shapes of the csvs being used for this case (rows, columns): (10, 10), (3, 5), (10, 6), (8, 3), (10, 4), (10, 9), (10, 8), (10, 4), (8, 5), (10, 4), (6, 6)
I'm clearly doing something wrong here.
Hi @msgonzaga
I don't think you are doing anything wrong. It's just a combination of two things: (1) there are a lot of nested levels, which makes the problem big and (2) the default parameters from SDV 0.4.5 are not optimal for such scenarios.
I just made a release candidate for v0.5.0 a few minutes ago which should be able to handle this better. Would you mind installing it and trying again?
To install it, run `pip install -U --pre sdv` and then make sure that you have `sdv==0.5.0.dev0` installed.
I will also try to reproduce this on my side to explore it a bit more.
Hi @csala, thanks for your attention.
I tried using the `sdv==0.5.0.dev0` version and unfortunately got an error:
Traceback (most recent call last):
File "example_sdv.py", line 59, in <module>
sample = multi_table_gen(folder)
File "example_sdv.py", line 41, in multi_table_gen
model.fit(tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 63, in fit
self._fit(tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 199, in _fit
self._model_table(table_name, tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 164, in _model_table
table = self._extend_table(table, tables, table_name)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 103, in _extend_table
extension = self._get_extension(child_name, child_table, child_key)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 90, in _get_extension
model.fit(child_rows.reset_index(drop=True))
File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 112, in fit
transformed = self._metadata.transform(data)
File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 515, in transform
return self._hyper_transformer.transform(data)
File "/usr/local/lib/python3.8/dist-packages/rdt/hyper_transformer.py", line 155, in transform
transformed = transformer.transform(column)
File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 174, in transform
return data.fillna(np.nan).apply(self._get_value).to_numpy()
File "/usr/local/lib/python3.8/dist-packages/pandas/core/series.py", line 4212, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
File "pandas/_libs/lib.pyx", line 2403, in pandas._libs.lib.map_infer
File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 147, in _get_value
mean, std = self.intervals[category][2:]
KeyError: nan
Update: I tried it again with the new 0.5.0 stable version and apparently got an error in the reverse_transform operation:
Traceback (most recent call last):
File "example_sdv.py", line 59, in <module>
sample = multi_table_gen(folder)
File "example_sdv.py", line 46, in multi_table_gen
sample = model.sample()
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 186, in sample
return self._sample(table_name, num_rows, sample_children)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 408, in _sample
sampled = self._sample_table(table, num_rows)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 371, in _sample_table
return self._finalize(sampled_data)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 229, in _finalize
parent_ids = self._find_parent_ids(table_name, parent_name, sampled_data)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 348, in _find_parent_ids
parent_rows = self._sample_rows(parent_model, parent_name, num_parent_rows)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 268, in _sample_rows
sampled = model.sample(num_rows)
File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 153, in sample
sampled = self._metadata.reverse_transform(sampled)
File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 570, in reverse_transform
reversed_data[name] = field_data[field_data.notnull()].astype(self._dtypes[name])
File "/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py", line 5548, in astype
new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py", line 604, in astype
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py", line 409, in apply
applied = getattr(b, f)(**kwargs)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/blocks.py", line 595, in astype
values = astype_nansafe(vals1d, dtype, copy=True)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/dtypes/cast.py", line 997, in astype_nansafe
return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'East Hannah'
Hello @msgonzaga, we managed to reproduce the following error that you had, using your metadata and generating some dummy values. We will keep you updated once we find what causes this problem.
Traceback (most recent call last):
File "example_sdv.py", line 59, in <module>
sample = multi_table_gen(folder)
File "example_sdv.py", line 41, in multi_table_gen
model.fit(tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 63, in fit
self._fit(tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 199, in _fit
self._model_table(table_name, tables)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 164, in _model_table
table = self._extend_table(table, tables, table_name)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 103, in _extend_table
extension = self._get_extension(child_name, child_table, child_key)
File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 90, in _get_extension
model.fit(child_rows.reset_index(drop=True))
File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 112, in fit
transformed = self._metadata.transform(data)
File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 515, in transform
return self._hyper_transformer.transform(data)
File "/usr/local/lib/python3.8/dist-packages/rdt/hyper_transformer.py", line 155, in transform
transformed = transformer.transform(column)
File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 174, in transform
return data.fillna(np.nan).apply(self._get_value).to_numpy()
File "/usr/local/lib/python3.8/dist-packages/pandas/core/series.py", line 4212, in apply
mapped = lib.map_infer(values, f, convert=convert_dtype)
File "pandas/_libs/lib.pyx", line 2403, in pandas._libs.lib.map_infer
File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 147, in _get_value
mean, std = self.intervals[category][2:]
KeyError: nan