silicate icon indicating copy to clipboard operation
silicate copied to clipboard

dm as the data model engine

Open mdsumner opened this issue 6 years ago • 6 comments

This looks very promising:

## devtools::install_github("krlmlr/dm")


  library(silicate)
  sc <- SC(minimal_mesh)

  library(dm)

  unclass(sc) %>% as_dm() %>% 
  cdm_add_pk(object, object_) %>% 
  cdm_add_fk(object_link_edge, object_, object) %>% 
  cdm_add_pk(edge, edge_) %>% 
  cdm_add_fk(object_link_edge, edge_, edge) %>% 
  cdm_add_pk(vertex, vertex_) %>% 
  cdm_add_fk(edge, .vx0, vertex) %>% 
  cdm_add_fk(edge, .vx1, vertex)  %>% 
  cdm_draw()

image

tri <- TRI(minimal_mesh)
unclass(tri) %>% as_dm() %>% 
  cdm_add_pk(object, object_) %>% 
  cdm_add_fk(triangle, object_, object) %>% 
  cdm_add_pk(vertex, vertex_) %>% 
  cdm_add_fk(triangle, .vx0, vertex) %>% 
  cdm_add_fk(triangle, .vx1, vertex) %>% 
  cdm_add_fk(triangle, .vx2, vertex) %>% 
  cdm_draw()
  

image

With osmdata we need to turn off checks because object_ is not unique or exclusive.

  library(osmdata)
  
 sci <-  opq ("hampi india") %>%
    add_osm_feature (key="historic", value="ruins") %>%
    osmdata_sc () 
 
    unclass(sci) %>% as_dm() %>% 
    cdm_add_pk(object, object_, check = F) %>% 
    cdm_add_fk(object_link_edge, object_, object, check = FALSE) %>% 
    cdm_add_pk(edge, edge_) %>% 
    cdm_add_fk(object_link_edge, edge_, edge) %>% 
    cdm_add_pk(vertex, vertex_) %>% 
    cdm_add_fk(edge, .vx0, vertex) %>% 
    cdm_add_fk(edge, .vx1, vertex)

    ── Table source ────────────────────────────────────────────────────────────────────────────────────────
    src:  <environment: 0x55b33d6bdce0>
      ── Data model ──────────────────────────────────────────────────────────────────────────────────────────
    Data model object:
      8 tables:  edge, meta, nodes, object ... 
    27 columns
    3 primary keys
    4 references
    ── Rows ────────────────────────────────────────────────────────────────────────────────────────────────
    Total: 489
    edge: 127, meta: 1, nodes: 39, object: 45, object_link_edge: 127, relation_members: 2, relation_properties: 3, vertex: 145
    

@mpadge basically what I was trying to do with https://github.com/hypertidy/rbot but obviously much more promising!

mdsumner avatar Jul 03 '19 00:07 mdsumner

Methods for as_dm

  library(silicate)

  as_dm.SC <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(object_link_edge, object_, object) %>% 
      cdm_add_pk(edge, edge_) %>% 
      cdm_add_fk(object_link_edge, edge_, edge) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(edge, .vx0, vertex) %>% 
      cdm_add_fk(edge, .vx1, vertex) 
  }
  as_dm.TRI <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(triangle, object_, object) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(triangle, .vx0, vertex) %>% 
      cdm_add_fk(triangle, .vx1, vertex) %>% 
      cdm_add_fk(triangle, .vx2, vertex)
  }
  
  as_dm.ARC <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(object_link_arc, object_, object) %>% 
      ## here we need table normalization (somehow, possibly by composing ARC from SC)
      cdm_add_pk(arc_link_vertex, arc_, check = FALSE) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(arc_link_vertex, vertex_, vertex) 
  }
  
  as_dm.PATH <- function(x) {
    unclass(x) %>% as_dm() %>% 
      cdm_add_pk(object, object_) %>% 
      cdm_add_fk(path, object_, object) %>% 
      ## here we need table normalization (somehow, possibly by composing PATH from SC)
      cdm_add_pk(path_link_vertex, path_, check = FALSE) %>% 
      cdm_add_pk(vertex, vertex_) %>% 
      cdm_add_fk(path_link_vertex, vertex_, vertex) 
  }
  
  library(dm)
as_dm(SC(minimal_mesh))
as_dm(TRI(minimal_mesh))
as_dm(ARC(minimal_mesh))
as_dm(PATH(minimal_mesh))

mdsumner avatar Jul 03 '19 01:07 mdsumner

decompose_table is similar to unjoin

unjoin::unjoin(mtcars, am, gear, carb, key_col = "parent_table")

decompose_table(mtcars, new_id, am, gear, carb)

mdsumner avatar Jul 03 '19 01:07 mdsumner

That's awesome - I hadn't even seen that

mpadge avatar Jul 03 '19 06:07 mpadge

We don't currently handle cycles (=parallel edges in this case) very well. Would a "longer form" be suitable for the triangle?

krlmlr avatar Mar 21 '20 06:03 krlmlr

I've actually grappled with that from very early on, and pretty sure I started with a long triangle form. A follow up question, when you declare which table to dm_filter() have you considered a tidygraph::activate workflow, so that a given table is put upfront and stays there?

That originally seemed to me to be the way to go, but maybe it's better to always declare the table in these verbs?

What follows is just here as a note to self, it took me a bit of to and fro to get it working, but it's worth exploring.

Here I try it out on-the-fly with conversion to and from dm, with a filter on object that culls triangles and vertices:

  as_dm_TRI_longform <- function(x) {
  x <- unclass(x)
  x$triangle <- x$triangle %>% 
    tidyr::pivot_longer(starts_with(".vx"),  
                                    names_to = "corner", values_to = "vertex_")
  
  x %>% as_dm() %>% 
    dm_add_pk(object, object_) %>% 
    dm_add_fk(triangle, object_, object) %>% 
    dm_add_pk(vertex, vertex_) %>% 
    dm_add_fk(triangle, vertex_, vertex)
}
as_TRIlongform_dm <- function(x) {
  x <- dm_apply_filters(x) %>% dm_get_tables()
  x$triangle <- x$triangle %>% 
    tidyr::pivot_wider(names_from = corner, 
                values_from = vertex_) %>% 
    tidyr::unnest(cols = c(.vx0, .vx1, .vx2))
  class(x) <- c("TRI", "sc")
  x
}
library(silicate)
#> 
#> Attaching package: 'silicate'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dm)
#> 
#> Attaching package: 'dm'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
tri <- TRI(minimal_mesh)
## a dm version of TRI
x <- as_dm_TRI_longform(tri) 
x
#> ── Table source ─────────────────────────────────────────────────────────
#> src:  <environment: R_GlobalEnv>
#> ── Metadata ─────────────────────────────────────────────────────────────
#> Tables: `object`, `triangle`, `vertex`, `meta`
#> Columns: 11
#> Primary keys: 2
#> Foreign keys: 2
#validate_dm(x)
#dm_get_tables(x)

## round-trip
tri_f <- x %>% dm_filter(object, a == 1) %>% as_TRIlongform_dm()
#> Warning: Values in `vertex_` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(vertex_ = list)` to suppress this warning.
#> * Use `values_fn = list(vertex_ = length)` to identify where the duplicates arise
#> * Use `values_fn = list(vertex_ = summary_fun)` to summarise duplicates
par(mfrow = c(1, 2))
plot(tri, col = grey.colors(nrow(tri$triangle)))
plot(tri_f, col = grey.colors(nrow(tri_f$triangle)))

Created on 2020-03-21 by the reprex package (v0.3.0)

mdsumner avatar Mar 21 '20 08:03 mdsumner

This is pretty good

  ## convert TRI to a longform triangle and then to dm
  as_dm_TRI_longform <- function(x) {
  x <- unclass(x)
  x$triangle <- x$triangle %>% 
    tidyr::pivot_longer(starts_with(".vx"),  
                                    names_to = "corner", values_to = "vertex_")
  
  x %>% as_dm() %>% 
    dm_add_pk(object, object_) %>% 
    dm_add_fk(triangle, object_, object) %>% 
    dm_add_pk(vertex, vertex_) %>% 
    dm_add_fk(triangle, vertex_, vertex)
  }
 ## convert a dm with longform TRI to TRI 
 as_TRIlongform_dm <- function(x) {
  x <- x %>% dm_apply_filters() %>% dm_get_tables() %>% 
    purrr::map(dplyr::collect)
    
  x$triangle <- x$triangle %>% 
    tidyr::pivot_wider(names_from = corner, 
                values_from = vertex_) %>% 
    tidyr::unnest(cols = c(.vx0, .vx1, .vx2))
  class(x) <- c("TRI", "sc")
  x
}
library(silicate)
#> 
#> Attaching package: 'silicate'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(dm)
#> 
#> Attaching package: 'dm'
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
 
 ## sf polygons of Provinces, in triangulated form (just because)
 tri <- TRI(inlandwaters)
 ## a dm version of TRI
 x <- as_dm_TRI_longform(tri) 
 x
#> ── Table source ─────────────────────────────────────────────────────────
#> src:  <environment: R_GlobalEnv>
#> ── Metadata ─────────────────────────────────────────────────────────────
#> Tables: `object`, `triangle`, `vertex`, `meta`
#> Columns: 12
#> Primary keys: 2
#> Foreign keys: 2
 ## unlink("afile.sql3")
 src <- dplyr::src_sqlite("afile.sql3", create = TRUE)
 sc <- copy_dm_to(src, x, temporary = FALSE)

 rm(tri, x)
 pryr::object_size(sc)
#> Registered S3 method overwritten by 'pryr':
#>   method      from
#>   print.bytes Rcpp
#> 19.2 kB
 file.info("afile.sql3")$size/1e6
#> [1] 7.589888

 ## apply filters to the object, and vertex tables and collect as TRI
tas_north <- sc %>% 
  dm_filter(object, Province == "Tasmania") %>% 
  dm_filter(vertex, y_ > -1500000) %>% 
  as_TRIlongform_dm()
#> Warning: Values in `vertex_` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(vertex_ = list)` to suppress this warning.
#> * Use `values_fn = list(vertex_ = length)` to identify where the duplicates arise
#> * Use `values_fn = list(vertex_ = summary_fun)` to summarise duplicates

  
## in the original sf "Province == "Tasmania" is a widely distributed
## set of islands, particularly the very tiny and to the far
## SE Macquarie Island, so we pick Tas and then zoom up to the north
## of the main islands of the province
par(mfrow = c(1, 2))
library(sf); plot(inlandwaters[5, 1]$geom, col = "grey")
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 7.0.0
plot(tas_north)

Created on 2020-03-21 by the reprex package (v0.3.0)

mdsumner avatar Mar 21 '20 10:03 mdsumner