csvs-to-sqlite icon indicating copy to clipboard operation
csvs-to-sqlite copied to clipboard

`--split` for splitting extracted columns on their values

Open simonw opened this issue 8 years ago • 3 comments

https://data.sfgov.org/Economy-and-Community/Mobile-Food-Facility-Permit/rqzj-sfat

Say for example there's a FoodItems column in the CSV that has data in it like this:

Cold Truck: Cheeseburgers: Burgers: Chicken Bake: Chili Dogs: Hot Dogs: Corn Dogs: Cup of Noodles: Egg Muffins: Tamales: Hot Sandwiches Quesadillas: Gatorade: Juice: Soda: Mikl: Coffee: Hot Cocoa: Hot Tea: Flan: Fruits: Fruit Salad: Yogurt: Candy: Chips: Donuts: Cookies: Granola: Muffins & Various Drinks & Pre-Packaged Snacks.

Running the following could create a lookup table with those individual broken out items in it:

csvs-to-sqlite -c FoodItems --split FoodItems ": "

So the --split option takes two arguments - the name of the column, and the separator to split on.

simonw avatar Nov 24 '17 23:11 simonw

This would be the first feature of csvs-to-sqlite that results in a many-to-many table.

simonw avatar Nov 26 '17 05:11 simonw

This one would be really helpful.

invasionofsmallcubes avatar Dec 25 '20 00:12 invasionofsmallcubes

Hey @simonw , I'm trying a run to this issue because I actually need it and I know I can reuse it. But I'm not sure I'm doing the right thing. Here's the diff, let me know if it's the right way. I'm not really skilled in panda, so I though to use plain sqlite but I'm not sure it's the right direction:

diff --git a/csvs_to_sqlite/cli.py b/csvs_to_sqlite/cli.py
index 10b7d30..15b4e49 100644
--- a/csvs_to_sqlite/cli.py
+++ b/csvs_to_sqlite/cli.py
@@ -42,20 +42,24 @@ import sqlite3
 @click.option(
     "--table", "-t", help="Table to use (instead of using CSV filename)", default=None
 )
[email protected](
+    "--split", multiple=True, help="You can use split to create a many2many relationship based on a split of the "
+                                   "field using a separator", default=None
+)
 @click.option(
     "--extract-column",
     "-c",
     multiple=True,
     help=(
-        "One or more columns to 'extract' into a separate lookup table. "
-        "If you pass a simple column name that column will be replaced "
-        "with integer foreign key references to a new table of that "
-        "name. You can customize the name of the table like so:\n"
-        "    state:States:state_name\n\n"
-        "This will pull unique values from the 'state' column and use "
-        "them to populate a new 'States' table, with an id column "
-        "primary key and a state_name column containing the strings "
-        "from the original column."
+            "One or more columns to 'extract' into a separate lookup table. "
+            "If you pass a simple column name that column will be replaced "
+            "with integer foreign key references to a new table of that "
+            "name. You can customize the name of the table like so:\n"
+            "    state:States:state_name\n\n"
+            "This will pull unique values from the 'state' column and use "
+            "them to populate a new 'States' table, with an id column "
+            "primary key and a state_name column containing the strings "
+            "from the original column."
     ),
 )
 @click.option(
@@ -123,25 +127,26 @@ import sqlite3
 )
 @click.version_option()
 def cli(
-    paths,
-    dbname,
-    separator,
-    quoting,
-    skip_errors,
-    replace_tables,
-    table,
-    extract_column,
-    date,
-    datetime,
-    datetime_format,
-    primary_key,
-    fts,
-    index,
-    shape,
-    filename_column,
-    no_index_fks,
-    no_fulltext_fks,
-    just_strings,
+        paths,
+        dbname,
+        separator,
+        quoting,
+        skip_errors,
+        replace_tables,
+        table,
+        extract_column,
+        date,
+        datetime,
+        datetime_format,
+        primary_key,
+        fts,
+        index,
+        shape,
+        filename_column,
+        no_index_fks,
+        no_fulltext_fks,
+        just_strings,
+        split,
 ):
     """
     PATHS: paths to individual .csv files or to directories containing .csvs
@@ -152,6 +157,9 @@ def cli(
     extract_columns = extract_column
     del extract_column
 
+    splits = split
+    del split
+
     if extract_columns:
         click.echo("extract_columns={}".format(extract_columns))
     if dbname.endswith(".csv"):
@@ -222,6 +230,18 @@ def cli(
             for index_defn in index:
                 add_index(conn, df.table_name, index_defn)
 
+    # Handle the split
+    for col in splits:
+        bits = col.split(":")
+        pks = [
+            r[1] for r in conn.execute('PRAGMA table_info({table_name});'.format(table_name=name)).fetchall() if r[5] == 1
+        ]
+        columns = map(lambda x: x + '_' + bits[0], pks)
+        c = bits[0] + ',' + ','.join(columns)
+        # TODO create table bits[0]
+        # TODO create joint table
+        print(c)
+
     # Create FTS tables
     if fts:
         fts_version = best_fts_version()
diff --git a/tests/test_csvs_to_sqlite.py b/tests/test_csvs_to_sqlite.py
index e4c5c44..0c39678 100644
--- a/tests/test_csvs_to_sqlite.py
+++ b/tests/test_csvs_to_sqlite.py
@@ -33,6 +33,11 @@ Adaptation,22.5,6 of December in the year 2002
 Face/Off,245.7,19 of June in the year 1997
 The Rock,134.1,9 of June in the year 1996"""
 
+CSV_SPLIT_FIELDS = """pk1,name,name2
+1,11;12,11;12
+2,12;23,14;25
+3,21,13;33"""
+
 
 def test_flat():
     runner = CliRunner()
@@ -476,7 +481,7 @@ def test_custom_primary_keys():
     with runner.isolated_filesystem():
         open("pks.csv", "w").write(CSV_CUSTOM_PRIMARY_KEYS)
         result = runner.invoke(
-            cli.cli, ("pks.csv pks.db -pk pk1 --primary-key pk2").split()
+            cli.cli, "pks.csv pks.db -pk pk1 --primary-key pk2".split()
         )
         assert result.exit_code == 0
         conn = sqlite3.connect("pks.db")
@@ -486,6 +491,26 @@ def test_custom_primary_keys():
         assert ["pk1", "pk2"] == pks
 
 
+def test_csv_split_fields():
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        open("split.csv", "w").write(CSV_SPLIT_FIELDS)
+        result = runner.invoke(
+            cli.cli, "split.csv split.db -pk pk1 --split name:\";\"".split()
+        )
+        assert result.stderr == ""
+        assert result.exit_code == 0
+        conn = sqlite3.connect("split.db")
+        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='split';").fetchone()
+        assert 'split' == rows[0]
+        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='name_split';").fetchone()
+        assert 'name_split' == rows[0]
+        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='name';").fetchone()
+        assert 'name' == rows[0]
+        # TODO check that the many to many table exist
+        # TODO check that the
+
+
 def test_just_strings_default():
     """
     Just like test_flat(), except all columns are strings

invasionofsmallcubes avatar Dec 26 '20 17:12 invasionofsmallcubes