SDV icon indicating copy to clipboard operation
SDV copied to clipboard

Small relational dataset example taking too long

Open msgonzaga opened this issue 3 years ago • 5 comments

  • SDV version: 0.4.5
  • Python version: 3.8
  • Operating System: Ubuntu 20.04 (Docker container)

I'm trying to create an HMA1 model with the Pubs dataset (https://relational.fit.cvut.cz/dataset/Pubs), but the fit method is taking too long (over 30 minutes). I limited the tables to a maximum of 100 rows, and the number of columns varies from 3 to 10. I saved the metadata in JSON format to make it easier to reproduce.

Here is the metadata.json:

{
	"tables": {
		"authors": {
			"fields": {
				"phone": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "phone_number"
				},
				"zip": {
					"type": "numerical",
					"subtype": "integer"
				},
				"city": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"au_id": {
					"type": "id",
					"subtype": "string"
				},
				"contract": {
					"type": "numerical",
					"subtype": "integer"
				},
				"au_lname": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"au_fname": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "last_name"
				},
				"address": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"state": {
					"type": "categorical"
				}
			},
			"primary_key": "au_id"
		},
		"discounts": {
			"fields": {
				"discounttype": {
					"type": "categorical"
				},
				"highqty": {
					"type": "numerical",
					"subtype": "float"
				},
				"discount": {
					"type": "numerical",
					"subtype": "float"
				},
				"lowqty": {
					"type": "numerical",
					"subtype": "float"
				},
				"stor_id": {
					"type": "numerical",
					"subtype": "float"
				}
			}
		},
		"employee": {
			"fields": {
				"job_lvl": {
					"type": "numerical",
					"subtype": "integer"
				},
				"fname": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "first_name"
				},
				"hire_date": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "date"
				},
				"lname": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "last_name"
				},
				"minit": {
					"type": "categorical"
				},
				"pub_id": {
					"type": "id",
					"subtype": "integer",
					"ref": {
						"table": "publishers",
						"field": "pub_id"
					}
				},
				"job_id": {
					"type": "id",
					"subtype": "integer",
					"ref": {
						"table": "jobs",
						"field": "job_id"
					}
				},
				"emp_id": {
					"type": "categorical"
				}
			}
		},
		"jobs": {
			"fields": {
				"job_id": {
					"type": "id",
					"subtype": "integer"
				},
				"min_lvl": {
					"type": "numerical",
					"subtype": "integer"
				},
				"job_desc": {
					"type": "categorical"
				},
				"max_lvl": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "first_name"
				}
			},
			"primary_key": "job_id"
		},
		"pub_info": {
			"fields": {
				"logo": {
					"type": "categorical"
				},
				"pub_id": {
					"type": "id",
					"subtype": "integer",
					"ref": {
						"table": "publishers",
						"field": "pub_id"
					}
				},
				"pr_info": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				}
			}
		},
		"publishers": {
			"fields": {
				"pub_name": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "first_name"
				},
				"city": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"pub_id": {
					"type": "id",
					"subtype": "integer"
				},
				"country": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"state": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				}
			},
			"primary_key": "pub_id"
		},
		"roysched": {
			"fields": {
				"hirange": {
					"type": "numerical",
					"subtype": "integer"
				},
				"royalty": {
					"type": "numerical",
					"subtype": "integer"
				},
				"lorange": {
					"type": "numerical",
					"subtype": "integer"
				},
				"title_id": {
					"type": "id",
					"subtype": "string",
					"ref": {
						"table": "titles",
						"field": "title_id"
					}
				}
			}
		},
		"sales": {
			"fields": {
				"ord_date": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "date"
				},
				"payterms": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "last_name"
				},
				"ord_num": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "first_name"
				},
				"title_id": {
					"type": "id",
					"subtype": "string",
					"ref": {
						"table": "titles",
						"field": "title_id"
					}
				},
				"stor_id": {
					"type": "id",
					"subtype": "integer",
					"ref": {
						"table": "stores",
						"field": "stor_id"
					}
				},
				"qty": {
					"type": "numerical",
					"subtype": "integer"
				}
			}
		},
		"stores": {
			"fields": {
				"zip": {
					"type": "numerical",
					"subtype": "integer"
				},
				"city": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "address"
				},
				"stor_address": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "last_name"
				},
				"stor_name": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "first_name"
				},
				"stor_id": {
					"type": "id",
					"subtype": "integer"
				},
				"state": {
					"type": "categorical"
				}
			},
			"primary_key": "stor_id"
		},
		"titleauthor": {
			"fields": {
				"au_id": {
					"type": "id",
					"subtype": "string",
					"ref": {
						"table": "authors",
						"field": "au_id"
					}
				},
				"au_ord": {
					"type": "numerical",
					"subtype": "integer"
				},
				"royaltyper": {
					"type": "numerical",
					"subtype": "integer"
				},
				"title_id": {
					"type": "id",
					"subtype": "string",
					"ref": {
						"table": "titles",
						"field": "title_id"
					}
				}
			}
		},
		"titles": {
			"fields": {
				"price": {
					"type": "numerical",
					"subtype": "float"
				},
				"advance": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "city"
				},
				"ytd_sales": {
					"type": "numerical",
					"subtype": "float"
				},
				"pub_id": {
					"type": "id",
					"subtype": "integer",
					"ref": {
						"table": "publishers",
						"field": "pub_id"
					}
				},
				"title_id": {
					"type": "id",
					"subtype": "string"
				},
				"notes": {
					"type": "categorical"
				},
				"pubdate": {
					"type": "categorical",
					"pii": "True",
					"pii_category": "date"
				},
				"royalty": {
					"type": "numerical",
					"subtype": "float"
				},
				"type": {
					"type": "categorical"
				},
				"title": {
					"type": "categorical"
				}
			},
			"primary_key": "title_id"
		}
	}
}

Here's the code I'm using:

import time
import json
import glob
from pathlib import Path
import pandas as pd
import sdv
from sdv import Metadata
from sdv.relational import HMA1

# Default maximum number of rows kept from each CSV table.
ROWS_LIM = 10


def load_tables(folder, rows_lim=ROWS_LIM):
    """Load every CSV table found in *folder* plus its metadata file.

    Args:
        folder: Directory holding the ``*.csv`` tables and a
            ``metadata/metadata.json`` file. A trailing path separator
            is optional.
        rows_lim: Maximum number of leading rows kept from each table.
            Defaults to ``ROWS_LIM``.

    Returns:
        A ``(table_dict, metadata_json)`` tuple. ``table_dict`` maps each
        table name (the CSV file name without its final extension) to the
        truncated DataFrame; ``metadata_json`` is the parsed content of
        ``metadata.json``.
    """
    base = Path(folder)
    table_dict = {}
    # glob returns files in arbitrary order; sort so tables are always
    # loaded (and later reported) in the same order.
    for path in sorted(glob.glob(str(base / "*.csv"))):
        # ``stem`` removes only the final extension, whereas
        # ``name.replace('.csv', '')`` also mangled names that contain
        # ".csv" somewhere in the middle.
        table_name = Path(path).stem
        table_dict[table_name] = pd.read_csv(path).head(rows_lim)
    # Context manager guarantees the metadata file handle is closed.
    with open(base / "metadata" / "metadata.json", "r", encoding="utf-8") as metadata_file:
        metadata_json = json.load(metadata_file)
    return table_dict, metadata_json


def multi_table_gen(folder, model='HMA1', sample_size=5):
    """Fit an HMA1 model on the tables in *folder* and return a sample.

    Each stage (model creation, fitting, sampling and line-break cleanup)
    is announced on stdout and followed by its elapsed time in seconds.

    Args:
        folder: Directory handed to ``load_tables``.
        model: Kept for backward compatibility; currently unused, an
            ``HMA1`` model is always built.
        sample_size: Kept for backward compatibility; currently unused,
            ``sample()`` is called with its default arguments.

    Returns:
        The object returned by ``HMA1.sample()``, with every newline
        character in each of its entries replaced by a single space.
    """
    tables, metadata_json = load_tables(folder)
    for table in tables.values():
        print(table.values.shape)
    metadata = Metadata(metadata_json)

    # ``perf_counter`` is monotonic, so the reported durations cannot be
    # skewed by system clock adjustments the way ``time.time()`` can.
    print("Creating model")
    start_time = time.perf_counter()
    # Use a dedicated local name instead of rebinding the ``model`` argument.
    hma_model = HMA1(metadata)
    print(f"Model created. Duration: {time.perf_counter() - start_time}")

    print("Fitting model")
    start_time = time.perf_counter()
    hma_model.fit(tables)
    print(f"Model fitted. Duration: {time.perf_counter() - start_time}")

    print("Getting sample")
    start_time = time.perf_counter()
    sample = hma_model.sample()
    print(f"Sample generated. Duration: {time.perf_counter() - start_time}")

    print("Replace line break")
    start_time = time.perf_counter()
    # Only existing keys are reassigned, so iterating the dict directly is safe.
    for key in sample:
        sample[key] = sample[key].replace({'\n': ' '}, regex=True)
    print(f"Line break finished. Duration: {time.perf_counter() - start_time}")

    return sample

folder = 'SDV_datasets/pubs_sample/'
sample = multi_table_gen(folder)
# Make sure the destination exists so the (slow to obtain) sample is not
# lost to a missing-directory error when writing the CSV files.
output_dir = Path(folder) / 'output'
output_dir.mkdir(parents=True, exist_ok=True)
for key, table in sample.items():
    table.to_csv(output_dir / f'{key}_synthetic.csv')

I even tried a smaller example, with up to 10 rows for each table, but it is also taking a lot of time. What can I change to make this run faster?

Thank you for your attention

msgonzaga avatar Nov 25 '20 19:11 msgonzaga

Update: when running the smaller example (with up to 10 rows), I got the following error: MemoryError: Unable to allocate 14.7 GiB for an array with shape (44413, 44413) and data type float64

These are the shapes of the csvs being used for this case (rows, columns): (10, 10), (3, 5), (10, 6), (8, 3), (10, 4), (10, 9), (10, 8), (10, 4), (8, 5), (10, 4), (6, 6)

I'm clearly doing something wrong here.

msgonzaga avatar Nov 25 '20 19:11 msgonzaga

Hi @msgonzaga

I don't think you are doing anything wrong. It's just a combination of two things: (1) there are a lot of nested levels, which makes the problem big and (2) the default parameters from SDV 0.4.5 are not optimal for such scenarios.

I just made a release candidate for v0.5.0 a few minutes ago which should be able to handle this better. Would you mind installing it and trying again?

To install it, run `pip install -U --pre sdv` and then make sure that you have `sdv==0.5.0.dev0` installed.

I will also try to reproduce this on my side to explore it a bit more.

csala avatar Nov 25 '20 20:11 csala

Hi @csala, thanks for your attention.

I tried using the sdv==0.5.0.dev0 version and unfortunately got an error:

Traceback (most recent call last):
  File "example_sdv.py", line 59, in <module>
    sample = multi_table_gen(folder)
  File "example_sdv.py", line 41, in multi_table_gen
    model.fit(tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 63, in fit
    self._fit(tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 199, in _fit
    self._model_table(table_name, tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 164, in _model_table
    table = self._extend_table(table, tables, table_name)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 103, in _extend_table
    extension = self._get_extension(child_name, child_table, child_key)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 90, in _get_extension
    model.fit(child_rows.reset_index(drop=True))
  File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 112, in fit
    transformed = self._metadata.transform(data)
  File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 515, in transform
    return self._hyper_transformer.transform(data)
  File "/usr/local/lib/python3.8/dist-packages/rdt/hyper_transformer.py", line 155, in transform
    transformed = transformer.transform(column)
  File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 174, in transform
    return data.fillna(np.nan).apply(self._get_value).to_numpy()
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/series.py", line 4212, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/lib.pyx", line 2403, in pandas._libs.lib.map_infer
  File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 147, in _get_value
    mean, std = self.intervals[category][2:]
KeyError: nan

msgonzaga avatar Nov 26 '20 18:11 msgonzaga

Update: I tried it again with the new 0.5.0 stable version and apparently got an error in the reverse_transform operation:

Traceback (most recent call last):
  File "example_sdv.py", line 59, in <module>
    sample = multi_table_gen(folder)
  File "example_sdv.py", line 46, in multi_table_gen
    sample = model.sample()
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 186, in sample
    return self._sample(table_name, num_rows, sample_children)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 408, in _sample
    sampled = self._sample_table(table, num_rows)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 371, in _sample_table
    return self._finalize(sampled_data)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 229, in _finalize
    parent_ids = self._find_parent_ids(table_name, parent_name, sampled_data)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 348, in _find_parent_ids
    parent_rows = self._sample_rows(parent_model, parent_name, num_parent_rows)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 268, in _sample_rows
    sampled = model.sample(num_rows)
  File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 153, in sample
    sampled = self._metadata.reverse_transform(sampled)
  File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 570, in reverse_transform
    reversed_data[name] = field_data[field_data.notnull()].astype(self._dtypes[name])
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py", line 5548, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py", line 604, in astype
    return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/managers.py", line 409, in apply
    applied = getattr(b, f)(**kwargs)
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/internals/blocks.py", line 595, in astype
    values = astype_nansafe(vals1d, dtype, copy=True)
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/dtypes/cast.py", line 997, in astype_nansafe
    return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'East Hannah'

msgonzaga avatar Dec 09 '20 21:12 msgonzaga

Hello @msgonzaga, we managed to reproduce the following error that you had, using your metadata and some generated dummy values. We will keep you updated once we find what causes this problem.

Traceback (most recent call last):
  File "example_sdv.py", line 59, in <module>
    sample = multi_table_gen(folder)
  File "example_sdv.py", line 41, in multi_table_gen
    model.fit(tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/base.py", line 63, in fit
    self._fit(tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 199, in _fit
    self._model_table(table_name, tables)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 164, in _model_table
    table = self._extend_table(table, tables, table_name)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 103, in _extend_table
    extension = self._get_extension(child_name, child_table, child_key)
  File "/usr/local/lib/python3.8/dist-packages/sdv/relational/hma.py", line 90, in _get_extension
    model.fit(child_rows.reset_index(drop=True))
  File "/usr/local/lib/python3.8/dist-packages/sdv/tabular/base.py", line 112, in fit
    transformed = self._metadata.transform(data)
  File "/usr/local/lib/python3.8/dist-packages/sdv/metadata/table.py", line 515, in transform
    return self._hyper_transformer.transform(data)
  File "/usr/local/lib/python3.8/dist-packages/rdt/hyper_transformer.py", line 155, in transform
    transformed = transformer.transform(column)
  File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 174, in transform
    return data.fillna(np.nan).apply(self._get_value).to_numpy()
  File "/usr/local/lib/python3.8/dist-packages/pandas/core/series.py", line 4212, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas/_libs/lib.pyx", line 2403, in pandas._libs.lib.map_infer
  File "/usr/local/lib/python3.8/dist-packages/rdt/transformers/categorical.py", line 147, in _get_value
    mean, std = self.intervals[category][2:]
KeyError: nan

pvk-developer avatar Mar 25 '21 13:03 pvk-developer