visdat
visdat copied to clipboard
explore how to make visdat work with facetting
as per Sam Firke's tweet:
https://twitter.com/samfirke/status/984425923243134976
Some thoughts on this.
I think that one good way forward, rather than (perhaps _only) supplying a "facet" argument as in the naniar::gg_*
family, there could be a "data method" for visdat.
This is already kind of provided, I think, in vis_gather_
.
This could instead be exported, and called something (slightly) better like data_vis_dat
. These data_*
methods would provide the underlying data structure.
These could then have a .grouped_df
method. So you would do something like
data %>%
group_by(grouping) %>%
# get the data structure
data_vis_dat() %>%
# perhaps vis_dat gains some S3 methods, so that it works with a grouped_df, and maybe has a special `.vis_dat` class?
vis_dat()
This seems like a lot more work than just:
vis_dat(data, facet = grouping)
But it would allow for perhaps more flexible operations.
I don't think I can use facet
as in regular ggplot, since that usually requires a change in the datastructure first.
I want to pursue this idea, but at a later date
Just a note, I repeatedly need this ability and so wrote a little hack using the patchwork
package that makes individual vis_dat()
plots for each index value and then combines them into a single plot. This was critical in showing me where I had a missing year of data that I had not realized previously. Given that a primary use of visdat
is to visualize missing values, I am even more convinced that this would feature would be incredibly value.
See example below (this is data from the IPEDS data on higher ed institutions):
If anyone wants to take my code and modify it to their own purpose, here you go (don't judge me, it was a rush job). This is custom for a specific purpose (IPEDS data), so will take a little work to generalize. And I'm not suggesting this as a good method for the actual visdat
package, just a hack for anyone to use in the mean time.
ipeds_visdat <- function(.data, years = "all", .sample_frac = .10) {
#Check that data is ipeds survey
if(!all(c("unitid", "year") %in% names(.data))) warning(".data does not contain a unitid or year column. Are you sure you passed an ipeds survey?")
#Make sure years is set
if(!all(years == "all" | is.numeric(years))) stop("\`years\` must be \"all\" or a numeric vector of 4-digit years.")
if(all(years == "all")) years <- min(.data$year):max(.data$year)
if(.sample_frac < 1) {
cli::cli_alert_info("Sampling data at {.sample_frac * 100}% per year.")
.data <- .data %>%
dplyr::group_by(year) %>%
dplyr::sample_frac(.sample_frac) %>%
dplyr::ungroup()
} else cli::cli_alert_info("Using 100% of data, this may be slow.")
p1 <- .data %>%
dplyr::filter(year == years[1]) %>% visdat::vis_dat(warn_large_data = F, sort_type = F, palette = "qual") +
ggplot2::labs(y = years[1]) + ggplot2::theme(plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
plist <- tibble::lst()
plist[[1]] <- p1
if(length(years > 1)) {
for(i in 2:length(years)) {
plist[[i]] <- .data %>%
dplyr::filter(year == years[{i}]) %>%
visdat::vis_dat(warn_large_data = F, sort_type = F, palette = "qual") +
ggplot2::labs(y = years[{i}]) +
ggplot2::theme(axis.text.x = ggplot2::element_blank(), plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
}
}
patchwork::wrap_plots(plist, ncol = 1, guides = "collect")
}
@jzadra I've worked on an approach for this in https://github.com/ropensci/visdat/pull/149, how does this look to you? Currently I've just got vis_dat
and vis_cor
:
library(visdat)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
vis_dat(airquality)
vis_dat(airquality, facet = Month)
vis_cor(airquality)
vis_cor(airquality, facet = Month)
airquality %>% data_vis_dat()
#> # A tibble: 918 × 4
#> rows variable valueType value
#> <int> <chr> <chr> <chr>
#> 1 1 Day integer 41
#> 2 1 Month integer 190
#> 3 1 Ozone integer 7.4
#> 4 1 Solar.R integer 67
#> 5 1 Temp integer 5
#> 6 1 Wind numeric 1
#> 7 2 Day integer 36
#> 8 2 Month integer 118
#> 9 2 Ozone integer 8
#> 10 2 Solar.R integer 72
#> # … with 908 more rows
airquality %>% group_by(Month) %>% data_vis_dat()
#> # A tibble: 765 × 5
#> # Groups: Month [5]
#> Month rows variable valueType value
#> <int> <int> <chr> <chr> <chr>
#> 1 5 1 Day integer 41
#> 2 5 1 Ozone integer 190
#> 3 5 1 Solar.R integer 7.4
#> 4 5 1 Temp integer 67
#> 5 5 1 Wind numeric 1
#> 6 5 2 Day integer 36
#> 7 5 2 Ozone integer 118
#> 8 5 2 Solar.R integer 8
#> 9 5 2 Temp integer 72
#> 10 5 2 Wind numeric 2
#> # … with 755 more rows
airquality %>% data_vis_cor()
#> # A tibble: 36 × 3
#> row_1 row_2 value
#> <chr> <chr> <dbl>
#> 1 Ozone Ozone 1
#> 2 Ozone Solar.R 0.348
#> 3 Ozone Wind -0.602
#> 4 Ozone Temp 0.698
#> 5 Ozone Month 0.165
#> 6 Ozone Day -0.0132
#> 7 Solar.R Ozone 0.348
#> 8 Solar.R Solar.R 1
#> 9 Solar.R Wind -0.0568
#> 10 Solar.R Temp 0.276
#> # … with 26 more rows
airquality %>% group_by(Month) %>% data_vis_cor()
#> # A tibble: 125 × 4
#> # Groups: Month [5]
#> Month row_1 row_2 value
#> <int> <chr> <chr> <dbl>
#> 1 5 Ozone Ozone 1
#> 2 5 Ozone Solar.R 0.243
#> 3 5 Ozone Wind -0.374
#> 4 5 Ozone Temp 0.554
#> 5 5 Ozone Day 0.302
#> 6 5 Solar.R Ozone 0.243
#> 7 5 Solar.R Solar.R 1
#> 8 5 Solar.R Wind -0.227
#> 9 5 Solar.R Temp 0.455
#> 10 5 Solar.R Day -0.0644
#> # … with 115 more rows
Created on 2022-11-25 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Monterey 12.3.1
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Australia/Brisbane
#> date 2022-11-25
#> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.0)
#> cli 3.4.1 2022-09-23 [1] CRAN (R 4.2.0)
#> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.0)
#> curl 4.3.3 2022-10-06 [1] CRAN (R 4.2.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
#> digest 0.6.30 2022-10-18 [1] CRAN (R 4.2.0)
#> dplyr * 1.0.10 2022-09-01 [1] CRAN (R 4.2.0)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
#> evaluate 0.17 2022-10-07 [1] CRAN (R 4.2.0)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.0)
#> farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.0)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
#> ggplot2 3.3.6 2022-05-03 [1] CRAN (R 4.2.0)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
#> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.0)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.2.0)
#> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.0)
#> httr 1.4.4 2022-08-17 [1] CRAN (R 4.2.0)
#> knitr 1.40 2022-08-24 [1] CRAN (R 4.2.0)
#> labeling 0.4.2 2020-10-20 [1] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
#> mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
#> purrr 0.3.5 2022-10-06 [1] CRAN (R 4.2.0)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.2.0)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.2.0)
#> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.2.0)
#> R.utils 2.12.0 2022-06-28 [1] CRAN (R 4.2.0)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> rmarkdown 2.17 2022-10-07 [1] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.0)
#> scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.0)
#> stringr 1.4.1 2022-08-20 [1] CRAN (R 4.2.0)
#> styler 1.7.0 2022-03-13 [1] CRAN (R 4.2.0)
#> tibble 3.1.8 2022-07-22 [1] CRAN (R 4.2.0)
#> tidyr 1.2.1 2022-09-08 [1] CRAN (R 4.2.0)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.0)
#> vctrs 0.4.2 2022-09-29 [1] CRAN (R 4.2.0)
#> visdat * 0.6.0.9000 2022-11-25 [1] local
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.0)
#> xfun 0.33 2022-09-12 [1] CRAN (R 4.2.0)
#> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.0)
#> yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
Hi @njtierney,
I think this is a great addition! I think it would be nice if there was an option for how the facets were organized just like in ggplot, as far as number of cols/rows. In many of my use cases, having the data all in one column is much easier to understand at a glance when the grouping variable is continuous or ordinal. The other feature that would help is some sampling options for large data.
Since I last posted, I greatly improved my function to be generalizable to any data (before it was just for IPEDS). In addition, it has the following features:
- Handles multiple methods in line with vis_dat functions: vis_dat, vis_miss, vis_value
- Handles existing grouping structure (as does yours)
- Makes assumptions about taking a sample fraction for large data based on the method and distributes it evenly across groups: for vis_miss and vis_val, it keeps all data. For vis_dat it takes a fraction based on the number of rows.
- Has the option of using parallelization via
furrr
if a future::plan() is set (if it is not, the plan is sequential by default)
Drawbacks/Issues:
- Still uses patchwork
- Can be problemating if there are a large number of groups in terms of the overall size (length) of the resulting plot.
Anyways, I'll share this code in case any of it is useful.
#' vis_dat for grouped data
#' @description Produce a vis_dat plot for ipeds data split by year with optional sampling.
#' `r lifecycle::badge('maturing')`
#'
#' Note that parallel processing is built in if a `future::plan()` is set
#' @importFrom magrittr "%>%"
#' @param ... bare, unquoted column(s) to use as the index to group by. Alternatively will accept a grouped df.
#' @param .sample_frac Percent of observations to sample from each year. Default "auto" samples down to 100,000 rows, split evenly between groups for vis_dat. For vis_miss and vis_value, "auto" uses all data.
#' @param method Which visdat function to use. One of "vis_dat", "vis_miss", or "vis_value". Accepts shorthand "dat", "val", and "miss".
#' @return visdat plot separated by grouping variable.
#' @examples
#' \dontrun{
#' diamonds %>% visdat_grouped(facet_group = cut)
#' }
#' @importFrom rlang .data
#' @export
visdat_grouped <- function(.data, ..., method = "vis_dat", .sample_frac = "auto") {
is_pregrouped <- dplyr::is_grouped_df(.data) #Does the data already have grouping structure?
#Set the visdat function to use
if(stringr::str_detect(method, "dat")) method <- "dat"
if(stringr::str_detect(method, "val")) method <- "val"
if(stringr::str_detect(method, "miss")) method <- "miss"
# for val and miss we want to see all the data, hence auto = 1
if((method == "val" | method == "miss") & .sample_frac == "auto") .sample_frac = 1
# Otherwise downsmample
if(.sample_frac == "auto") {
if(nrow(.data) > 100000) {
.sample_frac <- 100000 / nrow(.data)
cli::cli_alert_info("Large data, automatically down-sampling data at {round(.sample_frac * 100)}%. To disable or change, set .sample_frac to a value between 0 and 1.")
} else .sample_frac <- 1
}
#Group the data
if(is_pregrouped) {
.data <- .data %>%
tibble::add_column(group_index = dplyr::group_indices(.)) %>%
tidyr::unite(group_name, dplyr::group_vars(.), sep = "\n", remove = F) %>%
dplyr::arrange(group_index)
} else {
.data <- .data %>%
dplyr::group_by(...) %>%
tibble::add_column(group_index = dplyr::group_indices(.)) %>%
tidyr::unite(group_name, ..., sep = "\n", remove = F) %>%
dplyr::arrange(group_index)
}
# Do any sampling
if(.sample_frac < 1) {
.data <- .data %>%
dplyr::sample_frac(.sample_frac / dplyr::n_groups(.)) #Needs to be updated, as sample_frac() is superseded. However sample_frac applies the fraction to each group if the data is grouped.
} else cli::cli_alert_info("Using 100% of data, this may be slow.")
#Split the data
.data <- .data %>% dplyr::group_split(.keep = F)
#Methods for each visdat graph
if(method == "dat") {
plist <- .data %>%
furrr::future_map(function(...) {
.data <- tibble::as_tibble(...)
group_name <- .data %>% dplyr::distinct(group_name) %>% dplyr::pull(group_name)
group_index <- .data %>% dplyr::distinct(group_index) %>% dplyr::pull(group_index)
.data <- .data %>% dplyr::select(-group_name, -group_index)
p <- .data %>%
visdat::vis_dat(warn_large_data = F, sort_type = F, palette = "qual") +
ggplot2::labs(y = group_name) +
ggplot2::theme(plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
if(group_index > 1) {
p <- p + ggplot2::theme(axis.text.x = ggplot2::element_blank(),
plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
}
return(p)
})
}
if(method == "val") {
plist <- .data %>%
furrr::future_map(function(...) {
.data <- tibble::as_tibble(...)
group_name <- .data %>% dplyr::distinct(group_name) %>% dplyr::pull(group_name)
group_index <- .data %>% dplyr::distinct(group_index) %>% dplyr::pull(group_index)
.data <- .data %>% dplyr::select(-group_name, -group_index)
p <- .data %>%
dplyr::select(tidyselect::where(is.numeric)) %>%
visdat::vis_value() +
ggplot2::labs(y = group_name) +
ggplot2::theme(plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
if(group_index > 1) {
p <- p + ggplot2::theme(axis.text.x = ggplot2::element_blank(),
plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
}
return(p)
})
}
if(method == "miss") {
plist <- .data %>%
furrr::future_map(function(...) {
.data <- tibble::as_tibble(...)
group_name <- .data %>% dplyr::distinct(group_name) %>% dplyr::pull(group_name)
group_index <- .data %>% dplyr::distinct(group_index) %>% dplyr::pull(group_index)
.data <- .data %>% dplyr::select(-group_name, -group_index)
p <- .data %>%
dplyr::select(tidyselect::where(is.numeric)) %>%
visdat::vis_miss(show_perc = T, warn_large_data = F) +
ggplot2::labs(y = group_name) +
ggplot2::theme(plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
if(group_index > 1) {
p <- p + ggplot2::theme(axis.text.x = ggplot2::element_blank(),
plot.margin = ggplot2::margin(0, 5.5, 0, 5.5, "pt"))
}
return(p)
})
}
patchwork::wrap_plots(plist, ncol = 1, guides = "collect")
}