clusterProfiler icon indicating copy to clipboard operation
clusterProfiler copied to clipboard

Add ssGSEA into clusterProfiler

Open Yunuuuu opened this issue 3 years ago • 2 comments

I have used GSVA and GenePattern or GenePattern for ssGSEA, GSVA didn't satisfy all need, it cannot compute the normalized enrichment scores using permutations, and ssGSEA in Gene Pattern isn't convenient in R, I have wrote some code based on GSVA, GenePattern and clusterProfiler, but it's too slow and inconvenient. Could u bring simily function with more efficient in clusterProfiler?

my slowly code is below(much based on clusterProfiler)

### AUR for area.under.RES, KS for Kolmogorov-Smirnov
gsea <- function(gene_list, gene_set, weight, statistic = c("AUR", "KS")){

    ###################################################################
    ##    geneList                                                   ##
    ##                                                               ##
    ## 1. Rank order the N genes in D to form L = { g_1, ... , g_N}  ##
    ##    according to the correlation, r(g_j)=r_j,                  ##
    ##    of their expression profiles with C.                       ##
    ##                                                               ##
    ###################################################################

    ###################################################################
    ##    exponent                                                   ##
    ##                                                               ##
    ## An weight p to control the weight of the step.                ##
    ##   When p = 0, Enrichment Score ( ES(S) ) reduces to           ##
    ##   the standard Kolmogorov-Smirnov statistic.                  ##
    ##   When p = 1, we are weighting the genes in S                 ##
    ##   by their correlation with C normalized                      ##
    ##   by the sum of the correlations over all of the genes in S.  ##
    ##                                                               ##
    ###################################################################


    ## genes defined in gene_set should appear in gene_list
    ## this is a must, see https://github.com/GuangchuangYu/DOSE/issues/23
    gene_set <- intersect(gene_set, names(gene_list))

    N <- length(gene_list)
    Nh <- length(gene_set)

    Phit <- Pmiss <- numeric(N)
    hits <- names(gene_list) %in% gene_set ## logical
    gene_list <- scale(gene_list, center = TRUE, scale = TRUE)

    Phit[hits] <- abs(gene_list[hits])^weight
    NR <- sum(Phit)
    Phit <- cumsum(Phit/NR)

    Pmiss[!hits] <-  1/(N-Nh)
    Pmiss <- cumsum(Pmiss)

    runningES <- Phit - Pmiss

    if (statistic == "KS") {
      ## ES is the maximum deviation from zero of Phit-Pmiss
      max.ES <- max(runningES)
      min.ES <- min(runningES)
      if( abs(max.ES) > abs(min.ES) ) {
        ES <- max.ES
      } else {
        ES <- min.ES
      }
    }
    if (statistic == "AUR") {
      # calculates the area under RES by adding up areas of individual
      # triangles + rectangles
      miss_auc <- 0.5 * ( Pmiss - c(0, Pmiss[-N]) ) + c(0, Pmiss[-N])
      hit_auc <- 0.5 * ( Phit - c(0, Phit[-N]) ) + c(0, Phit[-N])
      ES <- sum(hit_auc) - sum(miss_auc)
    }

    df <- data.frame(x=seq_along(runningES),
                     runningScore=runningES,
                     position = hits
    )
    df$gene = names(gene_list)

    res <- list(ES=ES, runningES = df)

    return(res)
  }

  perm_gene_set <- function(gene_list) {

    perm.idx <- sample.int(length(gene_list), replace = FALSE)
    perm_gene_set <- gene_list
    names(perm_gene_set) <- names(gene_list)[perm.idx]
    return(perm_gene_set)

  }

  perm_gsea_es <- function(gene_list, gene_set, weight, statistic) {

    gene_list <- perm_gene_set(gene_list)
    res <- gsea(

        gene_list = gene_list,
        gene_set = gene_set,
        weight = weight,
        statistic = statistic

      )$ES
    return(res)
  }

  project_to_geneset <- function(data_matrix, gene_set_list, weight, statistic, NES, nPerm, BPPARAM, verbose){

    gene_name <- rownames(data_matrix)
    message("running ssGSEA ", "using ", statistic, sep = "")
    es_res <- BiocParallel::bplapply(1:ncol(data_matrix), function(j) {

      ssgsea_data <- data_matrix[, j, drop = TRUE]
      names(ssgsea_data) <- gene_name
      ssgsea_data <- sort(ssgsea_data, decreasing = TRUE)

      if (verbose) {
        message_nes <- ""
        if (NES) message_nes <- ", NES and p-values"
        message(
          "calculating ES", message_nes, " for ",
          j, "th sample ", colnames(data_matrix)[j]
        )
      }

      enrich_score <- lapply(gene_set_list, function(gene_set){

        es <- gsea(
          gene_list = ssgsea_data,
          gene_set = gene_set,
          weight = weight,
          statistic = statistic
        )$ES

        if (NES) {
          perm_scores <- lapply(1:nPerm, function(i) {
            perm_gsea_es(
              gene_list = ssgsea_data,
              gene_set = gene_set,
              weight = weight,
              statistic = statistic
            )
          })

          perm_scores <- unlist(perm_scores)
          pos_m <- mean(perm_scores[perm_scores > 0])
          neg_m <- abs(mean(perm_scores[perm_scores < 0]))


          if (sign(es) == 1 || sign(es) == 0) m <- pos_m
          if (sign(es) == -1) m <- neg_m

          NES <- es/m


          if( is.na(NES) ) {
            pvals <- NA
          } else if ( NES >= 0 ) {
            pvals <- (sum(perm_scores >= NES) +1) / (sum(perm_scores >= 0)+1)
          } else { # NES < 0
            pvals <- (sum(perm_scores <= NES) +1) / (sum(perm_scores < 0)+1)
          }

          return(list(ES = es, NES = NES, P.values = pvals))
        }
        return(list(ES = es))

      })

      return(enrich_score)

    }, BPPARAM = BPPARAM)

    names(es_res) <- colnames(data_matrix)

    ES <- lapply(es_res, function(smp) {
      sample_es_vector <- lapply(smp, function(gene_set) gene_set$ES)
      return(unlist(sample_es_vector, use.names = TRUE))
    })
    ES <- do.call("cbind", ES)
    res <- list(ES = ES)
    if (NES) {
      NES <- lapply(es_res, function(smp) {
        sample_nes_vector <- lapply(smp, function(gene_set) gene_set$NES)
        return(unlist(sample_nes_vector, use.names = TRUE))
      })
      NES <- do.call("cbind", NES)

      P.values <- lapply(es_res, function(smp) {
        sample_pva_vector <- lapply(smp, function(gene_set) {
          return(gene_set$P.values)
        })
        return(unlist(sample_pva_vector, use.names = TRUE))
      })
      P.values <- do.call("cbind", P.values)

      res$NES <- NES
      res$P.values <- P.values
    }
    return(res)

  } # end of project_to_geneset

```

Yunuuuu avatar Dec 16 '20 03:12 Yunuuuu

Y叔,习惯用clusterProfiler做富集!希望也能单样本!!

Yunuuuu avatar Dec 16 '20 04:12 Yunuuuu

单样本我也希望能够做, @huerqiang

GuangchuangYu avatar Dec 16 '20 05:12 GuangchuangYu

谢谢Y叔,谢谢您 @huerqiang

Yunuuuu avatar Dec 16 '20 12:12 Yunuuuu