Some hashes change when importing one storr into another.
Portability and collaboration in drake would greatly improve if we could convert caches among different storr backends (ref: https://github.com/ropensci/unconf18/issues/30). Currently, I am having trouble with $import(). It appears to change the values of some of the hashes.
# Generate a cache and a flat log of hashes.
library(drake) # https://github.com/ropensci/drake/commit/6f0012a17c05fa16981d7e027240c40862790fa9
library(storr) # https://github.com/richfitz/storr/commit/27508231b3c061afed9bb243d4422f81291f2e94
load_mtcars_example()
make(my_plan, verbose = FALSE)
cache1 <- storr_rds(".drake")
# Try to convert the RDS cache into a DBI cache.
mydb <- DBI::dbConnect(RSQLite::SQLite(), "database-file.sqlite")
cache2 <- storr::storr_dbi(
tbl_data = "datatable",
tbl_keys = "keystable",
con = mydb,
hash_algorithm = cache1$driver$hash_algorithm
)
cache2$import(cache1, namespace = cache1$list_namespaces())
# Some targets have different hashes.
cache1$get_hash("regression1_large", namespace = "kernels")
#> [1] "e1501ed9d62b846e"
cache2$get_hash("regression1_large", namespace = "kernels")
#> [1] "378d49d1626bdd6f"
# Show more differences.
log1 <- drake_cache_log(cache = cache1)
log2 <- drake_cache_log(cache = cache2)
diff <- which(log1$hash != log2$hash)
log1[diff, ]
#> # A tibble: 2 x 3
#> hash type name
#> <chr> <chr> <chr>
#> 1 e1501ed9d62b846e target regression1_large
#> 2 2a400716e73eac8f target regression1_small
log2[diff, ]
#> # A tibble: 2 x 3
#> hash type name
#> <chr> <chr> <chr>
#> 1 378d49d1626bdd6f target regression1_large
#> 2 8c0111e4bce91e86 target regression1_small
Created on 2018-12-12 by the reprex package (v0.2.1)
A shorter reproduction:
> library(drake)
> library(storr)
> load_mtcars_example()
> make(my_plan, verbose = FALSE)
> cache1 <- storr_rds(".drake")
> cache1$get_hash("regression1_large", namespace = "kernels")
[1] "e1501ed9d62b846e"
> cache1$hash_object(cache1$get("regression1_large", namespace = "kernels"))
[1] "378d49d1626bdd6f"
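and, with `obj` presumably being the object retrieved above (`cache1$get("regression1_large", namespace = "kernels")`), storing it in a fresh storr gives the same changed hash: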
> cache2 <- storr::storr_rds(tempfile(), hash_algorithm = "xxhash64")
> cache2$set("obj", obj)
> cache2$get_hash("obj")
[1] "378d49d1626bdd6f"
It looks an awful lot like this is an environment serialisation issue. Comparing the hashes present in the first cache but not in the second:
h <- setdiff(cache1$list_hashes(), cache2$list_hashes())
changed <- setNames(lapply(h, cache1$get_value), h)
which shows
$`031df699ec5b0faf`
<storr>
Public:
archive_export: function (path, names = NULL, namespace = NULL)
archive_import: function (path, names = NULL, namespace = NULL)
check: function (full = TRUE, quiet = FALSE, progress = !quiet)
clear: function (namespace = self$default_namespace)
clone: function (deep = FALSE)
default_namespace: objects
del: function (key, namespace = self$default_namespace)
destroy: function ()
driver: driver_rds, R6
duplicate: function (key_src, key_dest, namespace = self$default_namespace,
envir: environment
exists: function (key, namespace = self$default_namespace)
exists_object: function (hash)
export: function (dest, list = NULL, namespace = self$default_namespace,
fill: function (key, value, namespace = self$default_namespace, use_cache = TRUE)
flush_cache: function ()
gc: function ()
get: function (key, namespace = self$default_namespace, use_cache = TRUE)
get_hash: function (key, namespace = self$default_namespace)
get_value: function (hash, use_cache = TRUE)
hash_object: function (object)
hash_raw: function (x)
import: function (src, list = NULL, namespace = self$default_namespace,
index_export: function (namespace = NULL)
index_import: function (index)
initialize: function (driver, default_namespace)
list: function (namespace = self$default_namespace)
list_hashes: function ()
list_namespaces: function ()
mget: function (key, namespace = self$default_namespace, use_cache = TRUE,
mget_hash: function (key, namespace = self$default_namespace)
mget_value: function (hash, use_cache = TRUE, missing = NULL)
mset: function (key, value, namespace = self$default_namespace, use_cache = TRUE)
mset_by_value: function (value, namespace = self$default_namespace, use_cache = TRUE)
mset_value: function (values, use_cache = TRUE)
repair: function (storr_check_results = NULL, quiet = FALSE, ..., force = FALSE)
serialize_object: function (object)
set: function (key, value, namespace = self$default_namespace, use_cache = TRUE)
set_by_value: function (value, namespace = self$default_namespace, use_cache = TRUE)
set_value: function (value, use_cache = TRUE)
traits: list
$`2a400716e73eac8f`
Call:
lm(formula = y ~ +x, data = d)
Coefficients:
(Intercept) x
36.663 -5.008
$`410f8f336035ee86`
function (i)
0.01
<environment: 0x51aac60>
$`5b6556b5a3ba478b`
IGRAPH b492388 DN-- 20 23 --
+ attr: name (v/c)
+ edges from b492388 (vertex names):
[1] random_rows ->simulate
[2] reg1 ->regression1_small
[3] reg1 ->regression1_large
[4] reg2 ->regression2_small
[5] reg2 ->regression2_large
[6] "report.Rmd" ->report
[7] coef_regression2_small->report
[8] large ->report
+ ... omitted several edges
$`901f3af361be4d69`
[1] ‘6.2.1’
$e1501ed9d62b846e
Call:
lm(formula = y ~ +x, data = d)
Coefficients:
(Intercept) x
36.291 -5.108
$f81d15390b8fc36e
IGRAPH b492388 DN-- 20 23 --
+ attr: name (v/c)
+ edges from b492388 (vertex names):
[1] random_rows ->simulate
[2] reg1 ->regression1_small
[3] reg1 ->regression1_large
[4] reg2 ->regression2_small
[5] reg2 ->regression2_large
[6] "report.Rmd" ->report
[7] coef_regression2_small->report
[8] large ->report
+ ... omitted several edges
These objects all contain environments, with the possible exception of the version number (`901f3af361be4d69`, i.e. ‘6.2.1’).
Ah, the version number is simply not there in the second cache, for reasons that also look suspicious.