phsmethods icon indicating copy to clipboard operation
phsmethods copied to clipboard

Function to extract file names.

Open Nic-Chr opened this issue 3 years ago • 3 comments

Proposing a function that combines the functionality of utils::fileSnapshot() and list.files() using stringr style string modifiers.

Below is a draft function I've written that will return a tibble with the file path (short and long), file extension, file size, date created, date modified and the pattern that first matched that file name. It accepts multiple patterns which can either be a character vector, or a list of stringr string modifiers.

Edit: Fixed a bug and also now using file.info() instead of fileSnapshot() as suggested by @jvillacampa.

extract_files <- function(path, pattern = ".", before = Sys.time(), pattern_type = "regex", latest_file = FALSE, ...){
  stopifnot(dir.exists(path))
  if(!isTRUE(lubridate::is.Date(before) || lubridate::is.POSIXct(before))) stop("Error: Please provide a date or datetime before argument")
  path <- normalizePath(path, winslash = "/")
  file_list <- file.info(list.files(path, full.names = TRUE), extra_cols = TRUE)
  file_list[["file_long"]] <- row.names(file_list)
  file_list[["file"]] <- stringr::str_remove(file_list[["file_long"]], pattern = path)
  file_list[["cdate"]] <- lubridate::as_date(file_list[["ctime"]])
  file_list <- tidyr::as_tibble(file_list)
  file_extract <- function(max_date){
    if (lubridate::is.Date(max_date)) {
      my_files <- file_list %>%
        dplyr::filter(.data[["cdate"]] <= lubridate::as_date(max_date))
    } else {
      my_files <- file_list %>%
        dplyr::filter(.data[["ctime"]] <= max_date)
    }
    # If pattern contains stringr modifiers, then use them, else will just use pattern_type as the string modifier
    is_stringr_modifier <- function(x) {
      isTRUE(all.equal(c("pattern", "character"), class(x)[2:3])) &&
        class(x)[1] %in% c("regex", "fixed", "coll")
    }
    if (length(pattern) == 1 & is_stringr_modifier(pattern)) pattern <- list(pattern)
    if (!is.list(pattern)) pattern <- as.list(pattern)
    stringr_modifier <- purrr::map_lgl(pattern, is_stringr_modifier)
    if (any(stringr_modifier)) {
      # Loop over patterns and find matching files
      my_files <- purrr::map_dfr(pattern, function(x) my_files %>%
                                   dplyr::filter(stringr::str_detect(.data[["file"]], x)) %>%
                                   dplyr::mutate(file_pattern = list(x)))
      # Using pattern_type
    } else {
      my_files <-  purrr::map_dfr(pattern, function(x) {
        x_pattern <- do.call(get(tolower(pattern_type), asNamespace("stringr")), list(x, ...))
        return(my_files %>%
                 dplyr::filter(stringr::str_detect(.data[["file"]], x_pattern)) %>%
                 dplyr::mutate(file_pattern = x_pattern))
      })
    }
    # For each pattern find latest file(s)
    if (latest_file) {
      my_files_out <- my_files %>%
        dplyr::group_by(.data[["file_pattern"]]) %>%
        dplyr::arrange(dplyr::desc(.data[["ctime"]])) %>%
        dplyr::slice_head(n = 1) %>%
        dplyr::mutate(before_date = max_date) %>%
        dplyr::ungroup()
    } else {
      my_files_out <- my_files %>%
        dplyr::mutate(before_date = max_date)
    }
    my_files_out %>%
      dplyr::mutate(file_type = dplyr::if_else(!.data[["isdir"]], # Get file extension
                                               stringr::str_extract(.data[["file"]], "(?<=\\.)[:alnum:]{1,}$"),
                                               "dir"))
  }
  # Extract files for each before element
  my_files_df <- purrr::map_dfr(before, file_extract) %>%
    dplyr::distinct(.data[["file"]], .keep_all = TRUE) %>%
    dplyr::select(file = .data[["file"]],
                  file_long = .data[["file_long"]],
                  file_type = .data[["file_type"]],
                  size = .data[["size"]],
                  date_created = .data[["ctime"]],
                  date_modified = .data[["mtime"]],
                  pattern_match = .data[["file_pattern"]])
  my_files_df
}

Nic-Chr avatar Jun 14 '21 11:06 Nic-Chr