gt
gt copied to clipboard
Performance of `fmt_*` in Interactive Tables
Prework
- [x] Read and abide by gt's code of conduct and contributing guidelines.
- [x] Search for duplicates among the existing issues (both open and closed).
Proposal
Performance discussion as followup from #1524
Interactive is definitely slower than "standard"... but our big problems were around these packageVersion
calls (still not entirely sure why they were so slow...) and with some markdown columns.
This is (a much tidied up version of) the script I was using to debug our perf issues...
library(gt)
library(tidyverse)
library(tictoc)
townyBase <-
towny |>
dplyr::select(name, starts_with("population"))
townyPlain <- function() {
townyBase |>
dplyr::mutate(
Link = glue::glue("<a href='https://google.com?q={htmltools::urlEncodePath(name)}'>{htmltools::htmlEscape(townyBase$name)}</a>")
)
}
townyHtml <- function() {
townyPlain() |>
dplyr::mutate(
Link = purrr::map(Link, ~ .x |> gt::html())
)
}
townyTag <- function() {
townyBase |>
dplyr::mutate(
Link = purrr::map(name, ~ { htmltools::tags$a(href= glue::glue("https://google.com?q={htmltools::urlEncodePath(.x)}"), .x) })
)
}
townyTagHtml <- function() {
townyTag() |>
dplyr::mutate(
Link = purrr::map(Link, ~ .x |> as.character() |> gt::html())
)
}
townyMd <- function() {
townyBase |>
dplyr::mutate(
Link = glue::glue("[{name}](https://google.com?q={htmltools::urlEncodePath(name)})")
)
}
townyMd2 <- function() {
townyMd() |>
dplyr::mutate(
Link = purrr::map(Link, ~ .x |> gt::md())
)
}
do_timed_call <- function(label, data, extras, final_opt) {
tictoc::tic(label)
tictoc::tic(paste0(label, "_data"))
fetched <- data()
tictoc::toc()
tictoc::tic(paste0(label, "_creating"))
table <-
fetched |>
gt() |>
fmt_integer() |>
cols_label_with(fn = function(x) sub("population_", "", x)) |>
cols_width(
name ~ px(200)
) |>
tab_header(
title = "Populations of Municipalities",
subtitle = "Census values from 1996 to 2021."
) |>
tab_source_note(source_note = md("Data taken from the `towny` dataset.")) |>
extras() |>
final_opt()
tictoc::toc()
tictoc::tic(paste0(label, "_rendering"))
html <-
table |>
gt:::as.tags.gt_tbl()
tictoc::toc()
tictoc::toc()
list(
table = table,
html = html
)
}
p1 <-
do_timed_call(
"plain",
townyPlain,
function(tab) { tab |> fmt_passthrough("Link", escape = FALSE) },
identity
)
p2 <-
do_timed_call(
"plain interactive",
townyPlain,
function(tab) { tab |> fmt_passthrough("Link", escape = FALSE) },
opt_interactive
)
p3 <-
do_timed_call(
"html",
townyHtml,
identity,
identity
)
p4 <-
do_timed_call(
"html interactive",
townyHtml,
identity,
opt_interactive
)
p5 <-
do_timed_call(
"md",
townyMd,
function(tab) { tab |> fmt_markdown("Link") },
identity
)
p6 <-
do_timed_call(
"md interactive",
townyMd,
function(tab) { tab |> fmt_markdown("Link") },
opt_interactive
)
p7 <-
do_timed_call(
"md2",
townyMd2,
identity,
identity
)
p8 <-
do_timed_call(
"md2 interactive",
townyMd2,
identity,
opt_interactive
)
Output here is:
plain_data: 0.05 sec elapsed
plain_creating: 0.06 sec elapsed
plain_rendering: 0.37 sec elapsed
plain: 0.48 sec elapsed
plain interactive_data: 0.04 sec elapsed
plain interactive_creating: 0.06 sec elapsed
plain interactive_rendering: 1.26 sec elapsed
plain interactive: 1.36 sec elapsed
html_data: 0.08 sec elapsed
html_creating: 0.05 sec elapsed
html_rendering: 0.4 sec elapsed
html: 0.53 sec elapsed
html interactive_data: 0.05 sec elapsed
html interactive_creating: 0.06 sec elapsed
html interactive_rendering: 1.47 sec elapsed
html interactive: 1.58 sec elapsed
md_data: 0.03 sec elapsed
md_creating: 0.04 sec elapsed
md_rendering: 3.06 sec elapsed
md: 3.13 sec elapsed
md interactive_data: 0.03 sec elapsed
md interactive_creating: 0.09 sec elapsed
**md interactive_rendering: 23.57 sec elapsed**
md interactive: 23.69 sec elapsed
md2_data: 0.03 sec elapsed
md2_creating: 0.03 sec elapsed
md2_rendering: 0.36 sec elapsed
md2: 0.42 sec elapsed
md2 interactive_data: 0.04 sec elapsed
md2 interactive_creating: 0.05 sec elapsed
md2 interactive_rendering: 1.72 sec elapsed
md2 interactive: 1.81 sec elapsed
The slowest call by far was p6
/'md interactive
which uses fmt_markdown
- it's 23 seconds compared to the sub 2 seconds for p8
/md2
rendering which uses md()
for rendering the markdown text...
The profiler suggested that render_formats/md_to_html
was doing something (looping?) which was much slower/heavier than migrate_unformatted_to_html
:
fmt_markdown | md |
---|---|
10x slower | "ok" |
I played with some additional code this evening - looking at fmt_integer
library(gt)
library(tidyverse)
library(tictoc)
townyBase <-
towny |>
dplyr::select(name, starts_with("population"))
townyPlain <- function() {
townyBase
}
townyFormatted <- function() {
townyBase |>
mutate(across(starts_with("population"), gt::vec_fmt_integer))
}
do_timed_call <- function(label, data, do_fmt, do_interactive) {
if (do_fmt) {
fn <- function(data) data |> fmt_integer(columns = starts_with("population_"))
} else {
fn <- identity
}
if (do_interactive) {
final_opt <- opt_interactive
} else {
final_opt <- identity
}
tictoc::tic(label)
tictoc::tic(paste0(label, "_data"))
fetched <- data()
tictoc::toc()
tictoc::tic(paste0(label, "_creating"))
table <-
fetched |>
gt() |>
fn() |>
final_opt()
tictoc::toc()
tictoc::tic(paste0(label, "_rendering"))
html <-
table |>
gt:::as.tags.gt_tbl()
tictoc::toc()
tictoc::toc()
list(
table = table,
html = html
)
}
plain_no_format <-
do_timed_call(
"plain",
townyPlain,
FALSE,
FALSE
)
#plain_no_format$table
plain_format <-
do_timed_call(
"plain_format",
townyPlain,
TRUE,
FALSE
)
#plain_format$table
plain_int <-
do_timed_call(
"interactive",
townyPlain,
FALSE,
TRUE
)
#plain_int$table
plain_format_int <-
do_timed_call(
"interactive_format",
townyPlain,
TRUE,
TRUE
)
#plain_format_int$table
preformatted <-
do_timed_call(
"preformatted",
townyFormatted,
FALSE,
FALSE
)
preformatted$table
preformatted_interactive <-
do_timed_call(
"preformatted_interactive",
townyFormatted,
FALSE,
TRUE
)
preformatted_interactive$table
This gives output:
plain_data: 0 sec elapsed
plain_creating: 0.07 sec elapsed
plain_rendering: 0.28 sec elapsed
plain: 0.36 sec elapsed
plain_format_data: 0 sec elapsed
plain_format_creating: 0.02 sec elapsed
plain_format_rendering: 0.4 sec elapsed
plain_format: 0.42 sec elapsed
interactive_data: 0 sec elapsed
interactive_creating: 0.04 sec elapsed
interactive_rendering: 0.34 sec elapsed
interactive: 0.38 sec elapsed
interactive_format_data: 0 sec elapsed
interactive_format_creating: 0.05 sec elapsed
interactive_format_rendering: 1.29 sec elapsed
interactive_format: 1.34 sec elapsed
preformatted_data: 0.3 sec elapsed
preformatted_creating: 0.02 sec elapsed
preformatted_rendering: 0.28 sec elapsed
preformatted: 0.6 sec elapsed
preformatted_interactive_data: 0.29 sec elapsed
preformatted_interactive_creating: 0.05 sec elapsed
preformatted_interactive_rendering: 0.3 sec elapsed
preformatted_interactive: 0.64 sec elapsed
The standout slowest rendering occurs when opt_interactive
and fmt_integer
are both active. Looking at the profile output for this, it looks similar to the markdown case:
with flame:
Digging into this... One thing that seems to be happening is that for interactive tables when a fmt_
is applied then render_formats
is called on the entire table 1+num_columns times:
- it's called once:
- in
build_data
called on https://github.com/rstudio/gt/blob/2e7d5d246b432ec79113452fd4413acbb493e53c/R/render_as_i_html.R#L37 - which calls
render_formats
here https://github.com/rstudio/gt/blob/2e7d5d246b432ec79113452fd4413acbb493e53c/R/build_data.R#L45
- in
- it's then called once per column
- in this per-column
lapply
: https://github.com/rstudio/gt/blob/2e7d5d246b432ec79113452fd4413acbb493e53c/R/render_as_i_html.R#L202 - which calls
extract_cells
which callsrender_formats
again: https://github.com/rstudio/gt/blob/2e7d5d246b432ec79113452fd4413acbb493e53c/R/export.R#L1584C13-L1584C13
- in this per-column
It feels like this multiple looping should be avoided - it looks like the data
passed to extract_cells
is the already formatted table content - it looks like extract_cells
is repeating lots of processing which has already happened in build_data
? (... but as someone who's not really familiar with this code, I might have this wrong?)
Another simpler demo:
library(gt)
library(tidyverse)
library(tictoc)
demo_length <- 10000
my_table <-
tibble(
Name = # generate random names
stringi::stri_rand_strings(demo_length, 10),
Population = round(runif(demo_length, 100000000, 1000000000)),
Score = runif(demo_length, 0, 1)
)
tictoc::tic("fmt_")
html1 <- my_table |>
gt() |>
fmt_number(columns = "Population") |>
fmt_percent(columns = "Score") |>
opt_interactive() |>
gt:::as.tags.gt_tbl()
tictoc::toc()
tictoc::tic("preformatted")
html2 <- my_table |>
mutate(Population = gt::vec_fmt_integer(Population)) |>
mutate(Score = gt::vec_fmt_percent(Score)) |>
gt() |>
opt_interactive() |>
gt:::as.tags.gt_tbl()
tictoc::toc()
This yields output like:
> source("~/.active-rstudio-document")
fmt_: 5.61 sec elapsed
preformatted: 2.7 sec elapsed
> source("~/.active-rstudio-document")
fmt_: 5.44 sec elapsed
preformatted: 2.7 sec elapsed
> source("~/.active-rstudio-document")
fmt_: 5.77 sec elapsed
preformatted: 2.64 sec elapsed
Running profvis
on the two sets of gt blocks of code shows the repeated render_formats
calls:
preformatted | fmt_ |
---|---|
Closing this - as I think #1529 merged yesterday addresses the main issue here 👍