ClustAssess
ClustAssess copied to clipboard
Genes with 0 expression appear in the Comparisons tab
Hi, While using ClustAssess on PBMC 3k dataset (https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-no-cell-sorting-3-k-1-standard-2-0-0), I noticed that genes with 0 expression are still plotted on UMAP in Comparison tab of ClustAssess ShinyApp.
Is this desired behavior or should it be fixed?
Input was prepared as follows:
library(Seurat)
preprocess_scRNAseq <- function(
file_path,
nfeat_min = 200,
nfeat_max = 3500,
ncount_max = 10000,
percent_mt_min = 7,
percent_mt_max = 25,
percent_rp_min = 3,
percent_rp_max = 25
) {
# Example of function usage:
# result <- preprocess_scRNAseq("path_to_file.h5", 200, 2500, 10000, 1, 10, 1, 10)
print(paste("Working on file", file_path))
# Load data
so1 <- Read10X_h5(file_path)
so1 <- CreateSeuratObject(counts = so1$"Gene Expression")
# Compute percentage of mitochondrial and ribosomal genes per cell
so1 <- PercentageFeatureSet(so1, pattern = "^MT-", col.name = "percent_mt")
so1 <- PercentageFeatureSet(so1, pattern = "^RP[SL][[:digit:]]", col.name = "percent_rp")
# Plot initial data characteristics
plt = VlnPlot(so1, features = c("nFeature_RNA", "nCount_RNA", "percent_mt", "percent_rp"), ncol = 4)
print(plt)
# Filtering based on user-defined thresholds
so1f <- subset(so1, subset = nFeature_RNA < nfeat_max & nFeature_RNA > nfeat_min &
nCount_RNA < ncount_max &
percent_mt < percent_mt_max & percent_mt > percent_mt_min &
percent_rp < percent_rp_max & percent_rp > percent_rp_min)
# Output the number of cells before and after filtering
print("Number of cells before filtering:")
print(ncol(so1))
print("Number of cells after filtering:")
print(ncol(so1f))
# Plot filtered data characteristics
plt = VlnPlot(so1f, features = c("nFeature_RNA", "nCount_RNA", "percent_mt", "percent_rp"), ncol = 4)
print(plt)
# Further clean-up: Removing mitochondrial and ribosomal genes
all.index <- seq_len(nrow(so1f))
MT.index <- grep(pattern = "^MT-", x = rownames(so1f), value = FALSE)
RP.index <- grep(pattern = "^RP[SL][[:digit:]]", x = rownames(so1f), value = FALSE)
print("Number of genes before MT & RP filtering:")
print(nrow(so1f))
so1f <- so1f[!((all.index %in% MT.index) | (all.index %in% RP.index)), ]
print("Number of genes after MT & RP filtering:")
print(nrow(so1f))
return(list(base = so1, filtered = so1f))
}
res1 <- preprocess_scRNAseq(
file1,
nfeat_min, nfeat_max, ncount_max, percent_mt_min, percent_mt_max, percent_rp_min, percent_rp_max)
so1 <- res1$base
# Try with the "full" dataset
so1 = SCTransform(
so1,
variable.features.n = n_abundant,
return.only.var.genes = FALSE,
verbose = FALSE
)
# Enable multi threading
RhpcBLASctl::blas_set_num_threads(1)
ncores <- 4
my_cluster <- parallel::makeCluster(
ncores,
type = "PSOCK"
)
doParallel::registerDoParallel(cl = my_cluster)
# Prepare HV and MA gene sets for ClustAssess
features <- dimnames(so1@assays$SCT)[[1]]
var_features <- so1@assays[["SCT"]]@var.features
most_abundant_genes <- rownames(so1@assays$SCT)[order(Matrix::rowSums(so1@assays$SCT),
decreasing = TRUE
)]
steps <- seq(from = 500, to = 3000, by = 500)
ma_hv_genes_intersection_sets <- sapply(steps, function(x) intersect(most_abundant_genes[1:x], var_features[1:x]))
ma_hv_genes_intersection <- Reduce(union, ma_hv_genes_intersection_sets)
ma_hv_steps <- sapply(ma_hv_genes_intersection_sets, length)
# Run ClustAssess
automm_output <- automatic_stability_assessment(
expression_matrix = so1@[email protected],
n_repetitions = 30,
n_neigh_sequence = seq(from = 5, to = 50, by = 5),
resolution_sequence = seq(from = 0.1, to = 1, by = 0.1), # can go up to 2
features_sets = list(
"HV" = var_features,
"MA" = most_abundant_genes[1:3000]
),
steps = list(
"HV" = steps,
"MA" = steps
),
n_top_configs = 2,
umap_arguments = list(
min_dist = 0.3,
n_neighbors = 30,
metric = "cosine"
),
save_temp = FALSE,
verbose = TRUE
)
write_shiny_app(
object = so1@assays$SCT@data,
metadata = [email protected],
clustassess_object = automm_output,
project_folder = "clustassess_app_dir_pmbc3k_full",
shiny_app_title = "PBMC 3k FULL dataset (3009 cells)"
)
shiny::runApp("clustassess_app_dir_pmbc3k_full")