MicrobiomeStat
MicrobiomeStat copied to clipboard
mStat_generate_report_single(): error in linda() call
mStat_generate_report_single()
fails due invalid call to linda()
in MicrobiomeStat_1.1.3
:
processing file: file10828c3bf4e.Rmd
|........................ | 78% [taxa-test-execution]
Quitting from lines 615-626 [taxa-test-execution] (file10828c3bf4e.Rmd)
Error in `linda()`:
! unused arguments (feature.sig.level = 0.2, feature.mt.method = "fdr")
Backtrace:
1. MicrobiomeStat::generate_taxa_test_single(...)
2. base::lapply(...)
3. MicrobiomeStat (local) FUN(X[[i]], ...)
When I replace linda function with a modified version accepting accessory arguments (this code is a copy of https://github.com/cafferychen777/MicrobiomeStat/blob/main/R/linda.R, only difference with the current linda()
version is the accessory argument ...
after verbose = TRUE
in function call), mStat_generate_report_single()
runs normally until the end.
Modified linda version:
linda_modified <- function(feature.dat, meta.dat, phyloseq.obj = NULL, formula, feature.dat.type = c("count", "proportion","other"),
prev.filter = 0, mean.abund.filter = 0, max.abund.filter = 0,
is.winsor = TRUE, outlier.pct = 0.03,
adaptive = TRUE, zero.handling = c("pseudo-count", "imputation"),
pseudo.cnt = 0.5, corr.cut = 0.1,
p.adj.method = "BH", alpha = 0.05,
n.cores = 1, verbose = TRUE, ...) {
############################################
# only difference with linda() is the accessory argument `...` after verbose = TRUE
############################################
feature.dat.type <- match.arg(feature.dat.type)
if (!is.null(phyloseq.obj)) {
feature.dat.type <- "count"
feature.dat <- phyloseq.obj@otu_table %>%
as.data.frame() %>%
as.matrix()
meta.dat <- phyloseq.obj@sam_data %>% as.matrix() %>%
as.data.frame()
}
if (any(is.na(feature.dat))) {
stop(
"The feature table contains NA values. Please remove or handle them before proceeding.\n"
)
}
allvars <- all.vars(as.formula(formula))
Z <- as.data.frame(meta.dat[, allvars])
###############################################################################
# Filter sample
keep.sam <- which(rowSums(is.na(Z)) == 0)
Y <- feature.dat[, keep.sam]
Z <- as.data.frame(Z[keep.sam, ])
names(Z) <- allvars
# Filter features
temp <- t(t(Y) / colSums(Y))
if (feature.dat.type == "other" & (max.abund.filter != 0 | mean.abund.filter != 0 | prev.filter != 0 )){
message("Note: Since feature.dat.type is set to 'other', all filters (max.abund.filter, mean.abund.filter, and prev.filter) are reset to 0.")
max.abund.filter <- 0
mean.abund.filter <- 0
prev.filter <- 0
}
keep.tax <- rowMeans(temp != 0) >= prev.filter & rowMeans(temp) >= mean.abund.filter & matrixStats::rowMaxs(temp) >= max.abund.filter
names(keep.tax) <- rownames(Y)
rm(temp)
if (verbose) {
message(
sum(!keep.tax), " features are filtered!\n"
)
}
Y <- Y[keep.tax, ]
n <- ncol(Y)
m <- nrow(Y)
## some samples may have zero total counts after screening taxa
if (any(colSums(Y) == 0)) {
ind <- which(colSums(Y) > 0)
Y <- Y[, ind]
Z <- as.data.frame(Z[ind, ])
names(Z) <- allvars
keep.sam <- keep.sam[ind]
n <- ncol(Y)
}
if (verbose) {
message(
"The filtered data has ", n, " samples and ", m, " features that will be tested!\n"
)
}
if (sum(rowSums(Y != 0) <= 2) != 0) {
warning(
"Some features have less than 3 nonzero values!\n",
"They have virtually no statistical power. You may consider filtering them in the analysis!\n"
)
}
###############################################################################
## scaling numerical variables
ind <- sapply(1:ncol(Z), function(i) is.numeric(Z[, i]))
Z[, ind] <- scale(Z[, ind])
## winsorization
if (is.winsor) {
Y <- winsor.fun(Y, 1 - outlier.pct, feature.dat.type)
}
##
if (grepl("\\(", formula)) {
random.effect <- TRUE
} else {
random.effect <- FALSE
}
if (is.null(rownames(feature.dat))) {
taxa.name <- (1:nrow(feature.dat))[keep.tax]
} else {
taxa.name <- rownames(feature.dat)[keep.tax]
}
if (is.null(rownames(meta.dat))) {
samp.name <- (1:nrow(meta.dat))[keep.sam]
} else {
samp.name <- rownames(meta.dat)[keep.sam]
}
## handling zeros
if (feature.dat.type == "count") {
if (any(Y == 0)) {
N <- colSums(Y)
if (adaptive) {
logN <- log(N)
if (random.effect) {
tmp <- lmer(as.formula(paste0("logN", formula)), Z)
} else {
tmp <- lm(as.formula(paste0("logN", formula)), Z)
}
corr.pval <- coef(summary(tmp))[-1, "Pr(>|t|)"]
if (any(corr.pval <= corr.cut)) {
if (verbose) {
message("Imputation approach is used.")
}
zero.handling <- "Imputation"
} else {
if (verbose) {
message("Pseudo-count approach is used.")
}
zero.handling <- "Pseudo-count"
}
}
if (zero.handling == "imputation") {
N.mat <- matrix(rep(N, m), nrow = m, byrow = TRUE)
N.mat[Y > 0] <- 0
tmp <- N[max.col(N.mat)]
Y <- Y + N.mat / tmp
} else {
Y <- Y + pseudo.cnt
}
}
}
if (feature.dat.type == "proportion") {
if (any(Y == 0)) {
# Half minimum approach
Y <- t(apply(Y, 1, function(x) {
x[x == 0] <- 0.5 * min(x[x != 0])
return(x)
}))
colnames(Y) <- samp.name
rownames(Y) <- taxa.name
}
}
## CLR transformation
logY <- log2(Y)
W <- t(logY) - colMeans(logY)
## linear regression
# oldw <- getOption('warn')
# options(warn = -1)
if (!random.effect) {
if (verbose) {
message("Fit linear models ...")
}
suppressMessages(fit <- lm(as.formula(paste0("W", formula)), Z))
res <- do.call(rbind, coef(summary(fit)))
df <- rep(n - ncol(model.matrix(fit)), m)
} else {
if (verbose) {
message("Fit linear mixed effects models ...")
}
fun <- function(i) {
w <- W[, i]
suppressMessages(fit <- lmer(as.formula(paste0("w", formula)), Z))
coef(summary(fit))
}
if (n.cores > 1) {
res <- mclapply(c(1:m), function(i) fun(i), mc.cores = n.cores)
} else {
suppressMessages(res <- foreach(i = 1:m) %do% fun(i))
}
res <- do.call(rbind, res)
}
# options(warn = oldw)
res.intc <- res[which(rownames(res) == "(Intercept)"), ]
rownames(res.intc) <- NULL
baseMean <- 2^res.intc[, 1]
baseMean <- baseMean / sum(baseMean) * 1e6
output.fun <- function(x) {
res.voi <- res[which(rownames(res) == x), ]
rownames(res.voi) <- NULL
if (random.effect) {
df <- res.voi[, 3]
}
log2FoldChange <- res.voi[, 1]
lfcSE <- res.voi[, 2]
# oldw <- getOption('warn')
# options(warn = -1)
suppressMessages(bias <- modeest::mlv(sqrt(n) * log2FoldChange,
method = "meanshift", kernel = "gaussian"
) / sqrt(n))
# options(warn = oldw)
log2FoldChange <- log2FoldChange - bias
stat <- log2FoldChange / lfcSE
pvalue <- 2 * pt(-abs(stat), df)
padj <- p.adjust(pvalue, method = p.adj.method)
reject <- padj <= alpha
output <- cbind.data.frame(baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, reject, df)
rownames(output) <- taxa.name
return(list(bias = bias, output = output))
}
variables <- unique(rownames(res))[-1]
variables.n <- length(variables)
bias <- rep(NA, variables.n)
output <- list()
for (i in 1:variables.n) {
tmp <- output.fun(variables[i])
output[[i]] <- tmp[[2]]
bias[i] <- tmp[[1]]
}
names(output) <- variables
rownames(Y) <- taxa.name
colnames(Y) <- samp.name
rownames(Z) <- samp.name
if (verbose) {
message("Completed.")
}
return(list(variables = variables, bias = bias, output = output, feature.dat.use = Y, meta.dat.use = Z))
}
Replacing linda version in the MicrobiomeStat namespace:
assignInNamespace("linda", linda_modified, ns="MicrobiomeStat")
I also need to define a winsor.fun
in the current environment -this is just a copy of the winsor.fun
found in https://github.com/cafferychen777/MicrobiomeStat/blob/main/R/linda.R-
winsor.fun <- function(Y, quan, feature.dat.type) {
# 如果feature.dat.type是 "count"
if (feature.dat.type == "count") {
# 计算矩阵Y每列的和,保存在变量N中
N <- colSums(Y)
# 将Y矩阵中的每个元素除以其相应列的和,得到矩阵P
# 对矩阵P进行两次转置以得到与Y相同的维度
P <- t(t(Y) / N)
# 对P矩阵的每行应用quantile函数,以quan为参数,得到一个向量cut
cut <- apply(P, 1, quantile, quan)
# 复制cut向量以创建一个与Y矩阵相同维度的矩阵Cut
Cut <- matrix(rep(cut, ncol(Y)), nrow(Y))
# 创建一个逻辑矩阵ind
# 其中每个元素为TRUE,如果相应的P矩阵元素大于相应的Cut矩阵元素
ind <- P > Cut
# 将P矩阵中大于Cut矩阵中对应元素的值,用Cut矩阵中的对应元素替换
P[ind] <- Cut[ind]
# 四舍五入P矩阵中的每个元素,并将结果保存在Y矩阵中
Y <- round(t(t(P) * N))
}
# 如果feature.dat.type是 "proportion"
if (feature.dat.type == "proportion") {
# 对Y矩阵的每行应用quantile函数,以quan为参数,得到一个向量cut
cut <- apply(Y, 1, quantile, quan)
# 复制cut向量以创建一个与Y矩阵相同维度的矩阵Cut
Cut <- matrix(rep(cut, ncol(Y)), nrow(Y))
# 创建一个逻辑矩阵ind
# 其中每个元素为TRUE,如果相应的Y矩阵元素大于相应的Cut矩阵元素
ind <- Y > Cut
# 将Y矩阵中大于Cut矩阵中对应元素的值,用Cut矩阵中的对应元素替换
Y[ind] <- Cut[ind]
}
# 返回修改后的Y矩阵
return(Y)
}
sessionInfo()
:
R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.5 LTS
Matrix products: default
BLAS/LAPACK: /home/sciliciumtheo/miniconda3/envs/R_microbiomestat/lib/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
[4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
[7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] MicrobiomeStat_1.1.3 tibble_3.2.1 rlang_1.1.3
[4] microbiome_1.24.0 ggplot2_3.4.4 phyloseq_1.46.0
loaded via a namespace (and not attached):
[1] bitops_1.0-7 inline_0.3.19 permute_0.9-7
[4] magrittr_2.0.3 clue_0.3-65 ade4_1.7-22
[7] matrixStats_1.2.0 compiler_4.3.2 mgcv_1.9-1
[10] systemfonts_1.0.5 vctrs_0.6.5 reshape2_1.4.4
[13] rmutil_1.1.10 stringr_1.5.1 pkgconfig_2.0.3
[16] crayon_1.5.2 fastmap_1.1.1 XVector_0.42.0
[19] labeling_0.4.3 pander_0.6.5 utf8_1.2.4
[22] modeest_2.4.0 rmarkdown_2.25 nloptr_2.0.3
[25] ragg_1.2.7 tinytex_0.49 purrr_1.0.2
[28] xfun_0.41 cachem_1.0.8 zlibbioc_1.48.0
[31] aplot_0.2.2 GenomeInfoDb_1.38.1 jsonlite_1.8.8
[34] biomformat_1.30.0 rhdf5filters_1.14.1 Rhdf5lib_1.24.0
[37] parallel_4.3.2 cluster_2.1.6 R6_2.5.1
[40] RColorBrewer_1.1-3 stringi_1.8.3 boot_1.3-28.1
[43] rpart_4.1.23 numDeriv_2016.8-1.1 Rcpp_1.0.12
[46] iterators_1.0.14 knitr_1.45 IRanges_2.36.0
[49] Matrix_1.6-5 splines_4.3.2 igraph_1.6.0
[52] tidyselect_1.2.0 yaml_2.3.8 vegan_2.6-4
[55] timeDate_4032.109 codetools_0.2-19 lattice_0.22-5
[58] lmerTest_3.1-3 plyr_1.8.9 Biobase_2.62.0
[61] withr_3.0.0 evaluate_0.23 stable_1.1.6
[64] Rtsne_0.17 gridGraphics_0.5-1 survival_3.5-7
[67] Biostrings_2.70.1 pillar_1.9.0 foreach_1.5.2
[70] stats4_4.3.2 ggfun_0.1.4 generics_0.1.3
[73] RCurl_1.98-1.14 S4Vectors_0.40.2 munsell_0.5.0
[76] scales_1.3.0 minqa_1.2.6 timeSeries_4032.109
[79] glue_1.7.0 statip_0.2.3 pheatmap_1.0.12
[82] tools_4.3.2 data.table_1.14.10 lme4_1.1-35.1
[85] spatial_7.3-17 fBasics_4032.96 forcats_1.0.0
[88] fs_1.6.3 rhdf5_2.46.1 grid_4.3.2
[91] tidyr_1.3.1 ape_5.7-1 colorspace_2.1-0
[94] patchwork_1.2.0 nlme_3.1-164 GenomeInfoDbData_1.2.11
[97] cli_3.6.2 textshaping_0.3.7 fansi_1.0.6
[100] dplyr_1.1.4 ggh4x_0.2.8 gtable_0.3.4
[103] yulab.utils_0.1.4 stabledist_0.7-1 digest_0.6.34
[106] BiocGenerics_0.48.1 ggrepel_0.9.5 ggplotify_0.1.2
[109] farver_2.1.1 memoise_2.0.1 htmltools_0.5.7
[112] multtest_2.58.0 lifecycle_1.0.4 statmod_1.5.0
[115] GUniFrac_1.8 MASS_7.3-60