ClustAssess icon indicating copy to clipboard operation
ClustAssess copied to clipboard

Genes with 0 expression appear in the Comparisons tab

Open crisb420 opened this issue 2 months ago • 0 comments

Hi, While using ClustAssess on PBMC 3k dataset (https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-no-cell-sorting-3-k-1-standard-2-0-0), I noticed that genes with 0 expression are still plotted on UMAP in Comparison tab of ClustAssess ShinyApp.

Is this desired behavior or should it be fixed?

image

Input was prepared as follows:

library(Seurat)

preprocess_scRNAseq <- function(
    file_path,
    nfeat_min = 200,
    nfeat_max = 3500,
    ncount_max = 10000,
    percent_mt_min = 7,
    percent_mt_max = 25,
    percent_rp_min = 3,
    percent_rp_max = 25
) {

    # Example of function usage:
    # result <- preprocess_scRNAseq("path_to_file.h5", 200, 2500, 10000, 1, 10, 1, 10)
    print(paste("Working on file", file_path))

    # Load data
    so1 <- Read10X_h5(file_path)
    so1 <- CreateSeuratObject(counts = so1$"Gene Expression")

    # Compute percentage of mitochondrial and ribosomal genes per cell
    so1 <- PercentageFeatureSet(so1, pattern = "^MT-", col.name = "percent_mt")
    so1 <- PercentageFeatureSet(so1, pattern = "^RP[SL][[:digit:]]", col.name = "percent_rp")

    # Plot initial data characteristics
    plt = VlnPlot(so1, features = c("nFeature_RNA", "nCount_RNA", "percent_mt", "percent_rp"), ncol = 4)
    print(plt)

    # Filtering based on user-defined thresholds
    so1f <- subset(so1, subset = nFeature_RNA < nfeat_max & nFeature_RNA > nfeat_min &
                            nCount_RNA < ncount_max &
                            percent_mt < percent_mt_max & percent_mt > percent_mt_min &
                            percent_rp < percent_rp_max & percent_rp > percent_rp_min)

    # Output the number of cells before and after filtering
    print("Number of cells before filtering:")
    print(ncol(so1))
    print("Number of cells after filtering:")
    print(ncol(so1f))

    # Plot filtered data characteristics
    plt = VlnPlot(so1f, features = c("nFeature_RNA", "nCount_RNA", "percent_mt", "percent_rp"), ncol = 4)
    print(plt)

    # Further clean-up: Removing mitochondrial and ribosomal genes
    all.index <- seq_len(nrow(so1f))
    MT.index <- grep(pattern = "^MT-", x = rownames(so1f), value = FALSE)
    RP.index <- grep(pattern = "^RP[SL][[:digit:]]", x = rownames(so1f), value = FALSE)

    print("Number of genes before MT & RP filtering:")
    print(nrow(so1f))

    so1f <- so1f[!((all.index %in% MT.index) | (all.index %in% RP.index)), ]

    print("Number of genes after MT & RP filtering:")
    print(nrow(so1f))

    return(list(base = so1, filtered = so1f))
}


res1 <- preprocess_scRNAseq(
    file1,
    nfeat_min, nfeat_max, ncount_max, percent_mt_min, percent_mt_max, percent_rp_min, percent_rp_max)
so1 <- res1$base
# Try with the "full" dataset
so1 = SCTransform(
    so1,
    variable.features.n = n_abundant,
    return.only.var.genes = FALSE,
    verbose = FALSE
)

# Enable multi threading
RhpcBLASctl::blas_set_num_threads(1)
ncores <- 4
my_cluster <- parallel::makeCluster(
    ncores,
    type = "PSOCK"
)
doParallel::registerDoParallel(cl = my_cluster)

# Prepare HV and MA gene sets for ClustAssess
features <- dimnames(so1@assays$SCT)[[1]]
var_features <- so1@assays[["SCT"]]@var.features
most_abundant_genes <- rownames(so1@assays$SCT)[order(Matrix::rowSums(so1@assays$SCT),
    decreasing = TRUE
)]
steps <- seq(from = 500, to = 3000, by = 500)
ma_hv_genes_intersection_sets <- sapply(steps, function(x) intersect(most_abundant_genes[1:x], var_features[1:x]))
ma_hv_genes_intersection <- Reduce(union, ma_hv_genes_intersection_sets)
ma_hv_steps <- sapply(ma_hv_genes_intersection_sets, length)

# Run ClustAssess
automm_output <- automatic_stability_assessment(
    expression_matrix = so1@[email protected],
    n_repetitions = 30,
    n_neigh_sequence = seq(from = 5, to = 50, by = 5),
    resolution_sequence = seq(from = 0.1, to = 1, by = 0.1),   # can go up to 2
    features_sets = list(
        "HV" = var_features,
        "MA" = most_abundant_genes[1:3000]
    ),
    steps = list(
        "HV" = steps,
        "MA" = steps
    ),
    n_top_configs = 2,
    umap_arguments = list(
        min_dist = 0.3,
        n_neighbors = 30,
        metric = "cosine"
    ),
    save_temp = FALSE,
    verbose = TRUE
)

write_shiny_app(
    object = so1@assays$SCT@data,
    metadata = [email protected],
    clustassess_object = automm_output,
    project_folder = "clustassess_app_dir_pmbc3k_full",
    shiny_app_title = "PBMC 3k FULL dataset (3009 cells)"
)

shiny::runApp("clustassess_app_dir_pmbc3k_full")

crisb420 avatar Apr 18 '24 15:04 crisb420