diffcyt icon indicating copy to clipboard operation
diffcyt copied to clipboard

Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used

Open LukaTandaric opened this issue 2 years ago • 1 comments

Dear Lukas,

I have fixed the calcCounts and calcMedians functions so that the rows and columns of SCEs are read properly. I have also added an option to pass an argument to the function that sets the (meta)clustering (from colnames(colData(SCE))) to be used when making the calculations. The argument has to be passed as a string (in "double quotes").

This function outputs counts per (meta)cluster per sample:

# Define the function that replaces calcCounts:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering from colnames(colData(SCE)) (in quotes ""). The cell counts will be calculated for each (meta)cluster.
calcCountsFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    coldata_df <- as.data.frame(colData(SCE))
    n_cells <- coldata_df %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% tally %>% complete(sample_id)
    n_cells <- acast(n_cells, paste0(clustering, " ~ sample_id"), value.var = "n", fill = 0)
    if (nrow(n_cells) < nlevels(colData(SCE)[[clustering]])) {
        missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(n_cells)))
        n_cells_tmp <- matrix(0, nrow = length(missing_markers), ncol = ncol(n_cells))
        rownames(n_cells_tmp) <- missing_markers
        n_cells <- rbind(n_cells, n_cells_tmp)
        n_cells <- n_cells[order(as.numeric(rownames(n_cells))), , drop = FALSE]
    }
    n_cells_total <- rowSums(n_cells)
    row_data <- setNames(data.frame(factor(rownames(n_cells), levels = levels(colData(SCE)[[clustering]])), n_cells_total, stringsAsFactors = FALSE),
                         c(clustering, "n_cells"))
    col_data <- metadata(SCE)$experiment_info
    n_cells <- n_cells[, match(col_data$sample_id, colnames(n_cells)), drop = FALSE]
    stopifnot(all(col_data$sample_id == colnames(n_cells)))
    d_counts <- SummarizedExperiment(assays = list(counts = n_cells), 
                                     rowData = row_data,
                                     colData = col_data)
    d_counts
}

This function outputs the median signal values from the arcSinh-transformed signal values:

# Define the function that replaces calcMedians:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["exprs"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}

This function outputs the median data made from raw signal values:

# The previous function outputs the median expression levels as arcSinh-transformed values with an argument of 5 (or which ever number
# you set as the value for the "cofactor" in "prepData"). In the case of CyTOF data, "cofactor = 5", which gives the formula "asinh(x/5)".
# In the following function, "[["exprs"]]" has been replaced by "[["counts"]]". This makes the function output raw, untransformed medians
# of marker expression. Be advised, arcSinh-transformed values should be used for statistical analyses due to being more comparable
# between markers and/or samples, unlike raw values, which can, for instance, vary from 5 to 5000.

# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed_raw <- function (SCE, clustering = "cluster_id"){
    if (!is(SCE, "SummarizedExperiment")) {
        stop("Data object must be a 'SummarizedExperiment'.")
    }
    if (!(clustering %in% (colnames(colData(SCE))))) {
        stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
    }
    is_marker <- rowData(SCE)$marker_class != "none"
    id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
    id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
    assaydata_mx <- as.matrix(assays(SCE)[["counts"]])
    medians <- vector("list", sum(is_marker))
    marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
    names(medians) <- marker_names_sub
    clus <- colData(SCE)[[clustering]]
    smp <- colData(SCE)$sample_id
    for (i in seq_along(medians)) {
        assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
        assaydata_i <- as.data.frame(assaydata_i)
        temp_clus <- data.frame(clus)
        names(temp_clus) <- clustering
        assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
        colnames(assaydata_i)[1] <- "value"
        med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
        med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
        if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
            missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
            med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
            rownames(med_tmp) <- missing_markers
            med <- rbind(med, med_tmp)
            med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
        }
        medians[[i]] <- med
    }
    for (i in seq_along(medians)) {
        if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
            stop("Cluster IDs do not match")
        }
        if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
            stop("Sample IDs do not match")
        }
    }
    row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
                         c(clustering))
    col_data <- metadata(SCE)$experiment_info
    medians <- lapply(medians, function(m) {
        m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
    })
    stopifnot(all(sapply(medians, function(m) {
        col_data$sample_id == colnames(m)
    })))
    metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
    d_medians <- SummarizedExperiment(assays = medians, rowData = row_data, 
                                      colData = col_data, metadata = metadata)
    d_medians
}

Please let me know if you decide to incorporate them into diffcyt.

Kind regards. Luka Tandaric

LukaTandaric avatar Nov 03 '23 10:11 LukaTandaric

I tested calcCountsFixed and it returns the results I attended on 1 example I am working with. I didn't tested the median computations. Thanks @LukaTandaric I think the choice of clustering is already in the original code of diffcyt https://github.com/lmweber/diffcyt/issues/56#issuecomment-1867010935

SamGG avatar Dec 21 '23 22:12 SamGG