diffcyt
diffcyt copied to clipboard
Fixed the calcCounts and calcMedians Functions and Added Option to Choose Which Clustering Is Used
Dear Lukas,
I have fixed the calcCounts and calcMedians functions so that the rows and columns of SCEs are read properly.
I have also added an option to pass an argument to the function that sets the (meta)clustering (from colnames(colData(SCE))) to be used when making the calculations. The argument has to be passed as a string (in "double quotes").
This function outputs counts per (meta)cluster per sample:
# Define the function that replaces calcCounts:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering from colnames(colData(SCE)) (in quotes ""). The cell counts will be calculated for each (meta)cluster.
calcCountsFixed <- function (SCE, clustering = "cluster_id"){
if (!is(SCE, "SummarizedExperiment")) {
stop("Data object must be a 'SummarizedExperiment'.")
}
if (!(clustering %in% (colnames(colData(SCE))))) {
stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
}
coldata_df <- as.data.frame(colData(SCE))
n_cells <- coldata_df %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% tally %>% complete(sample_id)
n_cells <- acast(n_cells, paste0(clustering, " ~ sample_id"), value.var = "n", fill = 0)
if (nrow(n_cells) < nlevels(colData(SCE)[[clustering]])) {
missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(n_cells)))
n_cells_tmp <- matrix(0, nrow = length(missing_markers), ncol = ncol(n_cells))
rownames(n_cells_tmp) <- missing_markers
n_cells <- rbind(n_cells, n_cells_tmp)
n_cells <- n_cells[order(as.numeric(rownames(n_cells))), , drop = FALSE]
}
n_cells_total <- rowSums(n_cells)
row_data <- setNames(data.frame(factor(rownames(n_cells), levels = levels(colData(SCE)[[clustering]])), n_cells_total, stringsAsFactors = FALSE),
c(clustering, "n_cells"))
col_data <- metadata(SCE)$experiment_info
n_cells <- n_cells[, match(col_data$sample_id, colnames(n_cells)), drop = FALSE]
stopifnot(all(col_data$sample_id == colnames(n_cells)))
d_counts <- SummarizedExperiment(assays = list(counts = n_cells),
rowData = row_data,
colData = col_data)
d_counts
}
This function outputs the median signal values from the arcSinh-transformed signal values:
# Define the function that replaces calcMedians:
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed <- function (SCE, clustering = "cluster_id"){
if (!is(SCE, "SummarizedExperiment")) {
stop("Data object must be a 'SummarizedExperiment'.")
}
if (!(clustering %in% (colnames(colData(SCE))))) {
stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
}
is_marker <- rowData(SCE)$marker_class != "none"
id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
assaydata_mx <- as.matrix(assays(SCE)[["exprs"]])
medians <- vector("list", sum(is_marker))
marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
names(medians) <- marker_names_sub
clus <- colData(SCE)[[clustering]]
smp <- colData(SCE)$sample_id
for (i in seq_along(medians)) {
assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
assaydata_i <- as.data.frame(assaydata_i)
temp_clus <- data.frame(clus)
names(temp_clus) <- clustering
assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
colnames(assaydata_i)[1] <- "value"
med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
rownames(med_tmp) <- missing_markers
med <- rbind(med, med_tmp)
med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
}
medians[[i]] <- med
}
for (i in seq_along(medians)) {
if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
stop("Cluster IDs do not match")
}
if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
stop("Sample IDs do not match")
}
}
row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
c(clustering))
col_data <- metadata(SCE)$experiment_info
medians <- lapply(medians, function(m) {
m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
})
stopifnot(all(sapply(medians, function(m) {
col_data$sample_id == colnames(m)
})))
metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
d_medians <- SummarizedExperiment(assays = medians, rowData = row_data,
colData = col_data, metadata = metadata)
d_medians
}
This function outputs the median data made from raw signal values:
# The previous function outputs the median expression levels as arcSinh-transformed values with an argument of 5 (or which ever number
# you set as the value for the "cofactor" in "prepData"). In the case of CyTOF data, "cofactor = 5", which gives the formula "asinh(x/5)".
# In the following function, "[["exprs"]]" has been replaced by "[["counts"]]". This makes the function output raw, untransformed medians
# of marker expression. Be advised, arcSinh-transformed values should be used for statistical analyses due to being more comparable
# between markers and/or samples, unlike raw values, which can, for instance, vary from 5 to 5000.
# SCE = The summarized experiment object the counts calculation will be done on.
# clustering = A (meta)clustering (in quotes "") from colnames(colData(SCE)). The cell counts will be calculated for each (meta)cluster.
calcMediansFixed_raw <- function (SCE, clustering = "cluster_id"){
if (!is(SCE, "SummarizedExperiment")) {
stop("Data object must be a 'SummarizedExperiment'.")
}
if (!(clustering %in% (colnames(colData(SCE))))) {
stop("Data object does not contain the given clustering. Check colnames(colData(SCE)).")
}
is_marker <- rowData(SCE)$marker_class != "none"
id_type_markers <- (rowData(SCE)$marker_class == "type")[is_marker]
id_state_markers <- (rowData(SCE)$marker_class == "state")[is_marker]
assaydata_mx <- as.matrix(assays(SCE)[["counts"]])
medians <- vector("list", sum(is_marker))
marker_names_sub <- as.character(rowData(SCE)$marker_name[is_marker])
names(medians) <- marker_names_sub
clus <- colData(SCE)[[clustering]]
smp <- colData(SCE)$sample_id
for (i in seq_along(medians)) {
assaydata_i <- t(assaydata_mx[marker_names_sub[i], , drop = FALSE])
assaydata_i <- as.data.frame(assaydata_i)
temp_clus <- data.frame(clus)
names(temp_clus) <- clustering
assaydata_i <- cbind(assaydata_i, sample_id = smp, temp_clus)
colnames(assaydata_i)[1] <- "value"
med <- assaydata_i %>% group_by(!!rlang::sym(clustering), sample_id, .drop = FALSE) %>% summarize(median = median(value))
med <- acast(med, paste0(clustering, " ~ sample_id"), value.var = "median", fill = NA)
if (nrow(med) < nlevels(colData(SCE)[[clustering]])) {
missing_markers <- which(!(levels(colData(SCE)[[clustering]]) %in% rownames(med)))
med_tmp <- matrix(NA, nrow = length(missing_markers), ncol = ncol(med))
rownames(med_tmp) <- missing_markers
med <- rbind(med, med_tmp)
med <- med[order(as.numeric(rownames(med))), , drop = FALSE]
}
medians[[i]] <- med
}
for (i in seq_along(medians)) {
if (!all(rownames(medians[[i]]) == rownames(medians[[1]]))) {
stop("Cluster IDs do not match")
}
if (!all(colnames(medians[[i]]) == colnames(medians[[1]]))) {
stop("Sample IDs do not match")
}
}
row_data <- setNames(data.frame(factor(rownames(medians[[1]]), levels = levels(colData(SCE)[[clustering]])), stringsAsFactors = FALSE),
c(clustering))
col_data <- metadata(SCE)$experiment_info
medians <- lapply(medians, function(m) {
m[, match(col_data$sample_id, colnames(m)), drop = FALSE]
})
stopifnot(all(sapply(medians, function(m) {
col_data$sample_id == colnames(m)
})))
metadata <- list(id_type_markers = id_type_markers, id_state_markers = id_state_markers)
d_medians <- SummarizedExperiment(assays = medians, rowData = row_data,
colData = col_data, metadata = metadata)
d_medians
}
Please let me know if you decide to incorporate them into diffcyt.
Kind regards. Luka Tandaric
I tested calcCountsFixed and it returns the results I attended on 1 example I am working with. I didn't tested the median computations. Thanks @LukaTandaric
I think the choice of clustering is already in the original code of diffcyt https://github.com/lmweber/diffcyt/issues/56#issuecomment-1867010935