pbapply
pbapply copied to clipboard
The pblapply is running slower and slower than expected. How should I fix it?
Hi, thanks for developing such a great tool! However, I ran into some problems with it when performing a large (kinda?) loop using pbapply
to accelerate the process. I did add gc()
and rm()
function in the loop, but it doesnt help and runs slower and slower as the loop number goes.
It seems that each CPU core it used to calculate it occupied more and more RAM than it should be for one loop though. In the beginning, each thread took ~9 RAM, then it grows larger without shrinking back.
Maybe I understand pbapply somewhere wrong, but I could not locate the problems. So any thoughts or suggestion would be appreciated! Here is the code I use. (The code is running on a ARM Macbook Pro (2023) with 128 RAM.)
gmb_files <- list.files(path = "~/ref/mbqtl/473/473hg19", pattern = "\\hg19.tsv.gz$", full.names = T)
cl <- makeCluster(4, type = "FORK")
res_stage1s <- pblapply(gmb_files, function(gmb){ # gmb = gmb_files[1]
# res_stage1s <- lapply(gmb_files[1:3], function(gmb){ # gmb = gmb_files[1]
gmb_id <- basename(gmb) %>% gsub("hg19.tsv.gz","",.); leo_message(paste("# Processing: ", gmb_id))
gmb_file <- fread(gmb) %>% mutate_at("CHR", as.integer)
res_one_gmbs <- lapply(1:nrow(loci), function(i){
# initial parameters
gwas_names <- c("T1D", "Iridocyclitis")
chr <- loci$CHR[i]
bp <- loci$BP[i]
win <- 500 # kb
gene <- loci$GENE[i]
Evidence <- ifelse(loci$Evidence[i]=="PLACO_COLOC","PC",loci$Evidence[i])
reminder <- paste0(Evidence,"_", gmb_id, "@", loci$CHR[i],":",loci$BP[i]-win*1000/2,"-",loci$BP[i]+win*1000/2)
# format hyprcoloc input
hypr_loci_dat <- format_hyprc_dat(gwas_names, chr, bp, win=win,
gmb=T, gmb_file=gmb_file, gmb_id=gmb_id)
beta_matrix <- hypr_loci_dat$beta_matrix
se_matrix <- hypr_loci_dat$se_matrix
snp_id = rownames(beta_matrix); leo_message(paste("\nSNP length:", length(snp_id)))
# hyprcoloc
binary_traits = c(1,1,0)
hypr_result <- hyprcoloc::hyprcoloc(beta_matrix, se_matrix,
trait.names = colnames(beta_matrix), snp.id = rownames(beta_matrix),
binary.outcomes = binary_traits)
reg.res <- hypr_result$results %>%
mutate(index = reminder, chr = chr, bp = bp, gmb_id = gmb_id, gene = gene) %>%
select(index, chr, bp, gmb_id, gene, everything())
# sensitivity
.check_hyprcoloc_postive <- function(reg.res) {
coloced_traits <- strsplit(reg.res$traits, ",")[[1]] %>% length()
logi1 <- nrow(reg.res) == 1
# logi2 <- reg.res$traits != "None"
# logi3 <- !is.na(reg.res$posterior_prob) && reg.res$posterior_prob > 0.25
logi4 <- coloced_traits == 3
logi = logi1 && logi4
if (logi) {return(TRUE)} else {return(FALSE)}
}
if (.check_hyprcoloc_postive(reg.res)) {
sen <- hyprcoloc::sensitivity.plot(beta_matrix, se_matrix,
trait.names = colnames(beta_matrix), snp.id = rownames(beta_matrix),
reg.thresh = c(0.5, 0.6, 0.7), align.thresh = c(0.5, 0.6, 0.7), prior.c = c(0.05, 0.02, 0.01, 0.005),
equal.thresholds = FALSE, similarity.matrix = TRUE)
sim.mat <- sen[[2]] # for self-defined pheatmap
save.path <- paste0("./figure/hyprcoloc/", reminder, ".pdf")
title <- paste0(loci$Evidence[i], "@", loci$CHR[i],":",loci$BP[i]-win*1000/2,"-",loci$BP[i]+win*1000/2)
p <- self_draw_hypr(sim.mat, title, save = T, save.path = save.path) # grid::grid.draw(p$gtable)
} else {message(paste0("# Failed to hyprcoloc for loci: ", reminder))}
# clean cache
rm(list = c("gwas", "chr", "bp", "win", "gene", "Evidence", "reminder",
"hypr_result", "beta_matrix", "se_matrix", "snp_id",
"sen", "sim.mat", "save.path", "title", "p")); gc()
return(reg.res)
})
res_one_gmb <- do.call("rbind", res_one_gmbs)
rm(gmb_file); rm(gmb_id); gc()
return(res_one_gmb)
}, cl = cl)
stopCluster(cl); rm(cl); gc()