CoordinateCleaner
CoordinateCleaner copied to clipboard
Issue in cd_round when ds >1
I found an error when the number of datasets to test is >1, and the columns do not match when some datasets cannot run the tests, while for some others you can. I attach here some bug fix hoping it helps
cc_round_v2<- function (x, lon = "decimallongitude", lat = "decimallatitude", ds = "dataset", T1 = 7, reg_out_thresh = 2, reg_dist_min = 0.1, reg_dist_max = 2, min_unique_ds_size = 4, graphs = F, test = "both", value = "flagged", verbose = TRUE) {
window_size <- 10 detection_rounding <- 2 detection_threshold <- 6 digit_round <- 0 nc <- 3000 rarefy <- FALSE match.arg(value, choices = c("flagged", "clean", "dataset","flagged2")) if (verbose) { message("Testing for rasterized collection") } if (length(unique(x[[ds]])) > 1) { dat <- split(x, f = x[[ds]]) out <- lapply(dat, function(k) {
tester <- k[complete.cases(k[, c(lon, lat)]), ]
if (nrow(tester[!duplicated(tester[, c(lon, lat)]),
]) < min_unique_ds_size) {
#adding info on the dataset with a problem
warning(paste0 (unique(k[[ds]])," :Dataset smaller than minimum test size"))
# out <- data.frame(dataset = unique(x[[ds]]),
# n.outliers = NA, n.regular.outliers = NA,
# regular.distance = NA, summary = NA)
out <- data.frame(dataset = unique(k[[ds]]),
n.outliers = NA, n.regular.outliers = NA,
regular.distance = NA, summary = NA)
}
else {
if (test == "lon") {
gvec <- CoordinateCleaner:::.CalcACT(data = k[[lon]], digit_round = digit_round,
nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size,
detection_rounding = detection_rounding,
detection_threshold = detection_threshold,
graphs = graphs)
n_outl$flag <- !all(n_outl$n.outliers > 0,
n_outl$regular.distance >= reg_dist_min,
n_outl$regular.distance <= reg_dist_max,
n_outl$n.regular.outliers >= reg_out_thresh)
if (graphs) {
title(paste(unique(k[[ds]]), n_outl$flag,
sep = " - "))
}
n_outl <- data.frame(unique(k[[ds]]), n_outl)
names(n_outl) <- c("dataset", "lon.n.outliers",
"lon.n.regular.distance", "lon.regular.distance",
"summary")
}
if (test == "lat") {
gvec <- CoordinateCleaner:::.CalcACT(data = k[[lat]], digit_round = digit_round,
nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size,
detection_rounding = detection_rounding,
detection_threshold = detection_threshold,
graphs = graphs)
n_outl$flag <- !all(n_outl$n.outliers > 0,
n_outl$regular.distance >= reg_dist_min,
n_outl$regular.distance <= reg_dist_max,
n_outl$n.regular.outliers >= reg_out_thresh)
if (graphs) {
title(paste(unique(k[[ds]]), n_outl$flag,
sep = " - "))
}
n_outl <- data.frame(unique(k[[ds]]), n_outl)
names(n_outl) <- c("dataset", "lat.n.outliers",
"lat.n.regular.distance", "lat.regular.distance",
"summary")
}
if (test == "both") {
gvec1 <- CoordinateCleaner:::.CalcACT(data = k[[lon]], digit_round = digit_round,
nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
n_outl_lon <- CoordinateCleaner:::.OutDetect(gvec1, T1 = T1, window_size = window_size,
detection_rounding = detection_rounding,
detection_threshold = detection_threshold,
graphs = graphs)
n_outl_lon$flag <- !all(n_outl_lon$n.outliers >
0, n_outl_lon$regular.distance >= reg_dist_min,
n_outl_lon$regular.distance <= reg_dist_max,
n_outl_lon$n.regular.outliers >= reg_out_thresh)
if (graphs) {
title(paste(unique(k[[ds]]), n_outl_lon$flag,
sep = " - "))
}
gvec2 <- CoordinateCleaner:::.CalcACT(data = k[[lat]], digit_round = digit_round,
nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
n_outl_lat <- CoordinateCleaner:::.OutDetect(gvec2, T1 = T1, window_size = window_size,
detection_rounding = detection_rounding,
detection_threshold = detection_threshold,
graphs = graphs)
n_outl_lat$flag <- !all(n_outl_lat$n.outliers >
0, n_outl_lat$regular.distance >= reg_dist_min,
n_outl_lat$regular.distance <= reg_dist_max,
n_outl_lat$n.regular.outliers >= reg_out_thresh)
if (graphs) {
title(paste(unique(k[[ds]]), n_outl_lat$flag,
sep = " - "))
}
n_outl <- cbind(unique(k[[ds]]), n_outl_lon,
n_outl_lat)
names(n_outl) <- c("dataset", "lon.n.outliers",
"lon.n.regular.outliers", "lon.regular.distance",
"lon.flag", "lat.n.outliers", "lat.n.regular.outliers",
"lat.regular.distance", "lat.flag")
n_outl$summary <- n_outl$lon.flag | n_outl$lat.flag
}
return(n_outl)
}
})
#out <- do.call("rbind.data.frame", out)
out <- do.call("bind_rows", out)
} else { if (nrow(x[!duplicated(x[, c(lon, lat)]), ]) < min_unique_ds_size) { warning("Dataset smaller than minimum test size") out <- data.frame(dataset = unique(x[[ds]]), n.outliers = NA, n.regular.outliers = NA, regular.distance = NA, summary = NA) } else { if (test == "lon") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "summary") } if (test == "lat") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "summary") } if (test == "both") { gvec1 <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lon <- CoordinateCleaner:::.OutDetect(gvec1, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lon$flag <- !all(n_outl_lon$n.outliers > 0, n_outl_lon$regular.distance >= reg_dist_min, n_outl_lon$regular.distance <= reg_dist_max, n_outl_lon$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lon$flag, sep = " - ")) } gvec2 <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lat <- CoordinateCleaner:::.OutDetect(gvec2, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lat$flag <- !all(n_outl_lat$n.outliers > 0, n_outl_lat$regular.distance >= reg_dist_min, n_outl_lat$regular.distance <= reg_dist_max, n_outl_lat$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lat$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl_lon, n_outl_lat) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "lon.flag", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "lat.flag") n_outl$summary <- n_outl$lon.flag | n_outl$lat.flag } out <- n_outl } } #adding an option flagged2 as output value when a test has not been run (NA) , instead of having an F out2 = merge (x,out[c('dataset','summary')], by.x= ds,by.y='dataset',all.x=T) switch(value, dataset = return(out), clean = return({ test <- x[x[[ds]] %in% out[out$summary, "dataset"], ] if (length(test) > 0) { test } else { NULL } }), flagged = return(x[[ds]] %in% out[out$summary, "dataset"]), flagged2 = return(out2$summary) ) }
I'm getting errors at all my cd_round()
attempts too, and I think it's the same issue (maybe also related to #66). Here's a small reproducible example:
occs <- rgbif::occ_search(scientificName = "Daboia mauritanica")
# this runs (with ds="species", to simulate a single dataset):
CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "species", value = "dataset")
# but this fails (with ds="datasetKey", which is what we need to do):
CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "datasetKey", value = "dataset")
# Error in rbind.data.frame(`1c5c3e48-7fc0-4d4f-96e6-c4df2c747f34` = list( :
# numbers of columns of arguments do not match
# In addition: There were 15 warnings (use warnings() to see them)
warnings()
# Warning messages:
# 1: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
# 2: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
[...]