`--split` for splitting extracted columns on their values
https://data.sfgov.org/Economy-and-Community/Mobile-Food-Facility-Permit/rqzj-sfat
Say for example there's a FoodItems column in the CSV that has data in it like this:
Cold Truck: Cheeseburgers: Burgers: Chicken Bake: Chili Dogs: Hot Dogs: Corn Dogs: Cup of Noodles: Egg Muffins: Tamales: Hot Sandwiches Quesadillas: Gatorade: Juice: Soda: Mikl: Coffee: Hot Cocoa: Hot Tea: Flan: Fruits: Fruit Salad: Yogurt: Candy: Chips: Donuts: Cookies: Granola: Muffins & Various Drinks & Pre-Packaged Snacks.
Running the following could create a lookup table with those individual broken out items in it:
csvs-to-sqlite -c FoodItems --split FoodItems ": "
So the --split option takes two arguments - the name of the column, and the separator to split on.
This would be the first feature of csvs-to-sqlite that results in a many-to-many table.
This one would be really helpful.
Hey @simonw , I'm trying a run to this issue because I actually need it and I know I can reuse it. But I'm not sure I'm doing the right thing. Here's the diff, let me know if it's the right way. I'm not really skilled in panda, so I though to use plain sqlite but I'm not sure it's the right direction:
diff --git a/csvs_to_sqlite/cli.py b/csvs_to_sqlite/cli.py
index 10b7d30..15b4e49 100644
--- a/csvs_to_sqlite/cli.py
+++ b/csvs_to_sqlite/cli.py
@@ -42,20 +42,24 @@ import sqlite3
@click.option(
"--table", "-t", help="Table to use (instead of using CSV filename)", default=None
)
[email protected](
+ "--split", multiple=True, help="You can use split to create a many2many relationship based on a split of the "
+ "field using a separator", default=None
+)
@click.option(
"--extract-column",
"-c",
multiple=True,
help=(
- "One or more columns to 'extract' into a separate lookup table. "
- "If you pass a simple column name that column will be replaced "
- "with integer foreign key references to a new table of that "
- "name. You can customize the name of the table like so:\n"
- " state:States:state_name\n\n"
- "This will pull unique values from the 'state' column and use "
- "them to populate a new 'States' table, with an id column "
- "primary key and a state_name column containing the strings "
- "from the original column."
+ "One or more columns to 'extract' into a separate lookup table. "
+ "If you pass a simple column name that column will be replaced "
+ "with integer foreign key references to a new table of that "
+ "name. You can customize the name of the table like so:\n"
+ " state:States:state_name\n\n"
+ "This will pull unique values from the 'state' column and use "
+ "them to populate a new 'States' table, with an id column "
+ "primary key and a state_name column containing the strings "
+ "from the original column."
),
)
@click.option(
@@ -123,25 +127,26 @@ import sqlite3
)
@click.version_option()
def cli(
- paths,
- dbname,
- separator,
- quoting,
- skip_errors,
- replace_tables,
- table,
- extract_column,
- date,
- datetime,
- datetime_format,
- primary_key,
- fts,
- index,
- shape,
- filename_column,
- no_index_fks,
- no_fulltext_fks,
- just_strings,
+ paths,
+ dbname,
+ separator,
+ quoting,
+ skip_errors,
+ replace_tables,
+ table,
+ extract_column,
+ date,
+ datetime,
+ datetime_format,
+ primary_key,
+ fts,
+ index,
+ shape,
+ filename_column,
+ no_index_fks,
+ no_fulltext_fks,
+ just_strings,
+ split,
):
"""
PATHS: paths to individual .csv files or to directories containing .csvs
@@ -152,6 +157,9 @@ def cli(
extract_columns = extract_column
del extract_column
+ splits = split
+ del split
+
if extract_columns:
click.echo("extract_columns={}".format(extract_columns))
if dbname.endswith(".csv"):
@@ -222,6 +230,18 @@ def cli(
for index_defn in index:
add_index(conn, df.table_name, index_defn)
+ # Handle the split
+ for col in splits:
+ bits = col.split(":")
+ pks = [
+ r[1] for r in conn.execute('PRAGMA table_info({table_name});'.format(table_name=name)).fetchall() if r[5] == 1
+ ]
+ columns = map(lambda x: x + '_' + bits[0], pks)
+ c = bits[0] + ',' + ','.join(columns)
+ # TODO create table bits[0]
+ # TODO create joint table
+ print(c)
+
# Create FTS tables
if fts:
fts_version = best_fts_version()
diff --git a/tests/test_csvs_to_sqlite.py b/tests/test_csvs_to_sqlite.py
index e4c5c44..0c39678 100644
--- a/tests/test_csvs_to_sqlite.py
+++ b/tests/test_csvs_to_sqlite.py
@@ -33,6 +33,11 @@ Adaptation,22.5,6 of December in the year 2002
Face/Off,245.7,19 of June in the year 1997
The Rock,134.1,9 of June in the year 1996"""
+CSV_SPLIT_FIELDS = """pk1,name,name2
+1,11;12,11;12
+2,12;23,14;25
+3,21,13;33"""
+
def test_flat():
runner = CliRunner()
@@ -476,7 +481,7 @@ def test_custom_primary_keys():
with runner.isolated_filesystem():
open("pks.csv", "w").write(CSV_CUSTOM_PRIMARY_KEYS)
result = runner.invoke(
- cli.cli, ("pks.csv pks.db -pk pk1 --primary-key pk2").split()
+ cli.cli, "pks.csv pks.db -pk pk1 --primary-key pk2".split()
)
assert result.exit_code == 0
conn = sqlite3.connect("pks.db")
@@ -486,6 +491,26 @@ def test_custom_primary_keys():
assert ["pk1", "pk2"] == pks
+def test_csv_split_fields():
+ runner = CliRunner()
+ with runner.isolated_filesystem():
+ open("split.csv", "w").write(CSV_SPLIT_FIELDS)
+ result = runner.invoke(
+ cli.cli, "split.csv split.db -pk pk1 --split name:\";\"".split()
+ )
+ assert result.stderr == ""
+ assert result.exit_code == 0
+ conn = sqlite3.connect("split.db")
+ rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='split';").fetchone()
+ assert 'split' == rows[0]
+ rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='name_split';").fetchone()
+ assert 'name_split' == rows[0]
+ rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='name';").fetchone()
+ assert 'name' == rows[0]
+ # TODO check that the many to many table exist
+ # TODO check that the
+
+
def test_just_strings_default():
"""
Just like test_flat(), except all columns are strings