phsmethods
phsmethods copied to clipboard
Function to extract file names.
Proposing a function that combines the functionality of utils::fileSnapshot()
and list.files()
using stringr
style string modifiers.
Below is a draft function I've written that will return a tibble with the file path (short and long), file extension, file size, date created, date modified and the pattern that first matched that file name.
It accepts multiple patterns which can either be a character vector, or a list of stringr
string modifiers.
Edit: Fixed a bug and also now using file.info()
instead of fileSnapshot()
as suggested by @jvillacampa.
extract_files <- function(path, pattern = ".", before = Sys.time(), pattern_type = "regex", latest_file = FALSE, ...){
stopifnot(dir.exists(path))
if(!isTRUE(lubridate::is.Date(before) || lubridate::is.POSIXct(before))) stop("Error: Please provide a date or datetime before argument")
path <- normalizePath(path, winslash = "/")
file_list <- file.info(list.files(path, full.names = TRUE), extra_cols = TRUE)
file_list[["file_long"]] <- row.names(file_list)
file_list[["file"]] <- stringr::str_remove(file_list[["file_long"]], pattern = path)
file_list[["cdate"]] <- lubridate::as_date(file_list[["ctime"]])
file_list <- tidyr::as_tibble(file_list)
file_extract <- function(max_date){
if (lubridate::is.Date(max_date)) {
my_files <- file_list %>%
dplyr::filter(.data[["cdate"]] <= lubridate::as_date(max_date))
} else {
my_files <- file_list %>%
dplyr::filter(.data[["ctime"]] <= max_date)
}
# If pattern contains stringr modifiers, then use them, else will just use pattern_type as the string modifier
is_stringr_modifier <- function(x) {
isTRUE(all.equal(c("pattern", "character"), class(x)[2:3])) &&
class(x)[1] %in% c("regex", "fixed", "coll")
}
if (length(pattern) == 1 & is_stringr_modifier(pattern)) pattern <- list(pattern)
if (!is.list(pattern)) pattern <- as.list(pattern)
stringr_modifier <- purrr::map_lgl(pattern, is_stringr_modifier)
if (any(stringr_modifier)) {
# Loop over patterns and find matching files
my_files <- purrr::map_dfr(pattern, function(x) my_files %>%
dplyr::filter(stringr::str_detect(.data[["file"]], x)) %>%
dplyr::mutate(file_pattern = list(x)))
# Using pattern_type
} else {
my_files <- purrr::map_dfr(pattern, function(x) {
x_pattern <- do.call(get(tolower(pattern_type), asNamespace("stringr")), list(x, ...))
return(my_files %>%
dplyr::filter(stringr::str_detect(.data[["file"]], x_pattern)) %>%
dplyr::mutate(file_pattern = x_pattern))
})
}
# For each pattern find latest file(s)
if (latest_file) {
my_files_out <- my_files %>%
dplyr::group_by(.data[["file_pattern"]]) %>%
dplyr::arrange(dplyr::desc(.data[["ctime"]])) %>%
dplyr::slice_head(n = 1) %>%
dplyr::mutate(before_date = max_date) %>%
dplyr::ungroup()
} else {
my_files_out <- my_files %>%
dplyr::mutate(before_date = max_date)
}
my_files_out %>%
dplyr::mutate(file_type = dplyr::if_else(!.data[["isdir"]], # Get file extension
stringr::str_extract(.data[["file"]], "(?<=\\.)[:alnum:]{1,}$"),
"dir"))
}
# Extract files for each before element
my_files_df <- purrr::map_dfr(before, file_extract) %>%
dplyr::distinct(.data[["file"]], .keep_all = TRUE) %>%
dplyr::select(file = .data[["file"]],
file_long = .data[["file_long"]],
file_type = .data[["file_type"]],
size = .data[["size"]],
date_created = .data[["ctime"]],
date_modified = .data[["mtime"]],
pattern_match = .data[["file_pattern"]])
my_files_df
}