dm icon indicating copy to clipboard operation
dm copied to clipboard

Guess ptype for `dm_unwrap_tbl()`

Open krlmlr opened this issue 2 years ago • 6 comments

Invariant: dm_unwrap_tbl() with guessed ptype, then dm_wrap_tbl() should yield the same result.

krlmlr avatar Jun 07 '22 15:06 krlmlr

If we changed the way dm_wrap_tbl() works by default, we could make unwrapping much easier to guess. The idea is to:

  • include both foreign table name and name of the key columns in the name of the new nested/packed columns
  • include key columns in the packed table

Packed

library(dm)
library(tidyverse)

nyc13 <- 
  dm_nycflights13() %>% 
  dm_select(flights, origin, dest, time_hour, tailnum, carrier)

nyc13_flights <-
  nyc13 %>%
  dm_wrap_tbl(flights) %>% 
  pull_tbl(flights)
names(nyc13_flights)
#> [1] "origin"    "dest"      "time_hour" "tailnum"   "carrier"   "airlines" 
#> [7] "airports"  "planes"    "weather"

# Current state:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "origin": "JFK",
#>     "dest": "BQN",
#>     "time_hour": "2013-01-10 23:00:00",
#>     "tailnum": "N571JB",
#>     "carrier": "B6",
#>     "airlines": {
#>       "name": "JetBlue Airways"
#>     },
#>     "airports": {
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes": {
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "weather": {
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

# Desired:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "dest": "BQN",
#>     "airports=origin": {
#>       "faa": "JFK",
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes=tailnum": {
#>       "tailnum": "N571JB",
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "airlines=carrier": {
#>       "carrier": "B6",
#>       "name": "JetBlue Airways"
#>     },
#>     "weather=origin,time_hour": {
#>       "origin": "JFK",
#>       "time_hour": "2013-01-10 23:00:00",
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

Created on 2022-06-09 by the reprex package (v2.0.1)

krlmlr avatar Jun 09 '22 11:06 krlmlr

Nested

library(dm)
library(tidyverse)

nyc13 <- 
  dm_nycflights13() %>% 
  dm_select(flights, origin, dest, time_hour, tailnum, carrier) %>% 
  dm_select_tbl(airlines, flights) %>% 
  dm_zoom_to(flights) %>% 
  head(150) %>% 
  dm_update_zoomed()

nyc13_airlines <-
  nyc13 %>%
  dm_wrap_tbl(airlines) %>% 
  pull_tbl(airlines)

# Current state
names(nyc13_airlines)
#> [1] "carrier" "name"    "flights"
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A"
#>       }
#>     ]
#>   }
#> ]

# Desired state:
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights=carrier": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A"
#>       }
#>     ]
#>   }
#> ]

Created on 2022-06-09 by the reprex package (v2.0.1)

krlmlr avatar Jun 09 '22 11:06 krlmlr

I wonder if this changes our ability to rely on only the ptype to specify the transformation. Should our spec be more like a plan that defines what tables to nest-join or pack-join, and what the names of the new tables should be?

krlmlr avatar Jun 09 '22 11:06 krlmlr

We could still compute directly a spec from the source ptype in dm_wrap_tbl(), but it feels like the spec should be a bit richer or more flexible.

krlmlr avatar Jun 09 '22 11:06 krlmlr

A few comments @krlmlr

  • Some DBMS limit names to as few as 30 bytes (e.g. Oracle 12.1 and below), your proposal makes this limit very easy to reach
  • The link between origin and faa is not enforced, or it would be by ordering the columns
  • We would need to inspect a column to know if it's nested or packed, and that might lead to ambiguities
  • You removed fks keys from flights after packing to place them only in packed columns. A key in the packing table might be used by several parents (in compound keys for example), so I'm not sure that they should be removed when placed in the packed column. In a partially wrapped model this would also break some queries that would have still worked had we kept those.

We might solve those issues by :

  • moving the key information to the inner nested/packed column names. In case of remote DBs this will end in the json thus we wouldn't suffer from character limitation there
  • suffixing outer packed and nested columns respectively with "<" and ">" (both ASCII so if I understand correctly it will only cost us one byte)
  • duplicating key columns so they are found both in the packing and packed table

Your packed example would become :

# Desired:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "origin": "JFK",
#>     "dest": "BQN",
#>     "time_hour": "2013-01-10 23:00:00",
#>     "tailnum": "N571JB",
#>     "carrier": "B6",
#>     "airports<": {
#>       "faa=origin": "JFK",
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes<": {
#>       "tailnum=tailnum": "N571JB",
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "airlines<": {
#>       "carrier= carrier": "B6",
#>       "name": "JetBlue Airways"
#>     },
#>     "weather<,": {
#>       "origin=origin": "JFK",
#>       "time_hour=time_hour": "2013-01-10 23:00:00",
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

your nested example would become :

# Desired state:
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights>": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ",
#>         "carrier=carrier": "9E"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY",
#>         "carrier=carrier": "9E"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A",
#>         "carrier=carrier": "9E"
#>       }
#>     ]
#>   }
#> ]

moodymudskipper avatar Sep 02 '22 15:09 moodymudskipper

All of the above doesn't mention primary keys. To reconstruct robustly without ptype we need to tag them too. I propose to suffix col names of primary keys with *.

moodymudskipper avatar Sep 04 '22 12:09 moodymudskipper