simplifyEnrichment icon indicating copy to clipboard operation
simplifyEnrichment copied to clipboard

Assign a representative GO term to each cluster

Open eggrandio opened this issue 2 years ago • 3 comments

Hi jokergoo, thank you so much for developing such useful R packages!

I have used semantic similarity to cluster enriched GO terms before, but this package makes it much easier. I usually try to assign a "representative GO term" to each cluster, and I think in some cases it could be more helpful than a word cloud. Let me know what you think and whether you have any suggestions on how to improve the assignment (and feel free to include it in the package if you want).

My idea is to first cluster GO terms by semantic similarity, then find the common ancestral GO terms for each cluster, and finally retrieve a common ancestral term with high Information Content for each cluster (otherwise, very generic terms are returned). The tricky part is how to select a term that is both informative and shared by the majority of the GO terms in the cluster. I have calculated a very simple "importance" score, n * IC^2, where n = the number of times the ancestral GO term appears among the ancestors of the terms in each GO cluster and IC = the information content of that term. The "importance" calculation could probably be improved!

Best,

library(tidyverse)
library(data.table)
library(GOSemSim)
library(GO.db)
library(simplifyEnrichment)

# Ancestor lookup: a list built from GOBPANCESTOR, later indexed by GO ID
GO_anc <- as.list(GOBPANCESTOR)

# Lookup table from GO ID to its term name and definition
GO_def <- suppressMessages(
  AnnotationDbi::select(
    GO.db,
    keys = keys(GO.db),
    columns = c("GOID", "TERM", "DEFINITION"),
    keytype = "GOID"
  )
)

# GOSemSim data object; its IC slot (information content) is used further down
GO_semsim <- godata("org.Hs.eg.db", ont = "BP")

# Draw 500 random GO terms with a fixed seed so the example is repeatable
set.seed(888)
go_id <- random_GO(500)

# Pairwise similarity matrix for the sampled terms
mat <- GO_similarity(go_id, ont = "BP")

# Note: I get a different number of clusters than the simplifyEnrichment
# vignette does; maybe the go_id vector is different?
df <- simplifyGO(mat, plot = FALSE)
Cluster 500 terms by 'binary_cut'... 22 clusters, used 1.235794 secs.


### Retrieve representative GO term for each cluster

# Pick one representative ancestral GO term per cluster: the ancestor with the
# highest "importance" = n * IC^2, where n counts how often the ancestor occurs
# among the ancestors of the cluster's terms and IC is its information content.
repr_GO <- GO_anc[go_id] %>%
  # Long format: one row per (ancestral term, query term) pair
  stack() %>%
  setNames(c("ancestral", "go_id")) %>%
  # Attach the simplifyGO cluster of each query term
  left_join(dplyr::select(df, cluster, id), by = c(go_id = "id")) %>%
  # n = number of rows sharing the same ancestor within the same cluster
  group_by(cluster, ancestral) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  # Look up the information content of each ancestor and score it
  mutate(
    IC = GO_semsim@IC[ancestral],
    importance = n * IC^2
  ) %>%
  # Keep the single highest-scoring ancestor of every cluster
  # (which.max skips NA scores and returns the first maximum)
  group_by(cluster) %>%
  dplyr::slice(which.max(importance)) %>%
  # Add the human-readable term name and keep only the columns of interest
  left_join(dplyr::select(GO_def, GOID, TERM), by = c(ancestral = "GOID")) %>%
  dplyr::select(ancestral, cluster, importance, TERM)

repr_GO
# A tibble: 22 x 4
# Groups:   cluster [22]
   ancestral  cluster importance TERM                                         
   <chr>        <dbl>      <dbl> <chr>                                        
 1 GO:0048583       1      326.  regulation of response to stimulus           
 2 GO:0043414       2      177.  macromolecule methylation                    
 3 GO:0051641       3      406.  cellular localization                        
 4 GO:0048513       4      458.  animal organ development                     
 5 GO:0051128       5      219.  regulation of cellular component organization
 6 GO:0044281       6      399.  small molecule metabolic process             
 7 GO:0042117       7       87.1 monocyte activation                          
 8 GO:0071335       8      142.  hair follicle cell proliferation             
 9 GO:0060081       9      171.  membrane hyperpolarization                   
10 GO:0007610      10       24.8 behavior                                     
# ... with 12 more rows







# for reference:
go_id
  [1] "GO:0050918" "GO:2000050" "GO:0060968" "GO:0070460" "GO:1900744" "GO:0002839" "GO:1905453" "GO:1904325" "GO:0019827" "GO:0051224" "GO:0016557"
 [12] "GO:1902082" "GO:0007512" "GO:0030037" "GO:1903147" "GO:0098930" "GO:0061045" "GO:0006558" "GO:0035498" "GO:2000975" "GO:0002312" "GO:0010957"
 [23] "GO:0010640" "GO:0150102" "GO:0002230" "GO:0016254" "GO:1904017" "GO:0007185" "GO:0070664" "GO:0002495" "GO:0002526" "GO:0051899" "GO:0002204"
 [34] "GO:0071258" "GO:0061086" "GO:0010878" "GO:0033484" "GO:0097091" "GO:0043010" "GO:0060322" "GO:0043381" "GO:0001933" "GO:0051453" "GO:0016119"
 [45] "GO:0110099" "GO:0050668" "GO:0009051" "GO:0035675" "GO:0072227" "GO:0002862" "GO:0060915" "GO:0051383" "GO:0072718" "GO:0033512" "GO:0090037"
 [56] "GO:0048563" "GO:0014057" "GO:0035499" "GO:0030193" "GO:0010569" "GO:0099518" "GO:0060218" "GO:0051225" "GO:2000391" "GO:1905832" "GO:1904206"
 [67] "GO:0007398" "GO:0015012" "GO:0002118" "GO:2000363" "GO:1902807" "GO:1903364" "GO:0051026" "GO:0070121" "GO:0002209" "GO:0098856" "GO:0086094"
 [78] "GO:0010996" "GO:1903556" "GO:1901076" "GO:0098746" "GO:0003143" "GO:0030225" "GO:0010526" "GO:0051457" "GO:0060386" "GO:1901647" "GO:0033327"
 [89] "GO:0032252" "GO:0031104" "GO:0007274" "GO:0043408" "GO:0090150" "GO:2000659" "GO:1904273" "GO:0071679" "GO:0070429" "GO:0021510" "GO:1904472"
[100] "GO:0060981" "GO:0051602" "GO:0001580" "GO:0007112" "GO:1903940" "GO:1904858" "GO:0014857" "GO:1902108" "GO:0050787" "GO:0033598" "GO:0001542"
[111] "GO:0071872" "GO:0007339" "GO:0042982" "GO:0002215" "GO:0072739" "GO:0070453" "GO:0051664" "GO:0072660" "GO:0060847" "GO:0002280" "GO:0060921"
[122] "GO:0045050" "GO:0034349" "GO:0003156" "GO:0042267" "GO:0060337" "GO:0010212" "GO:1901096" "GO:0042574" "GO:0009893" "GO:0010963" "GO:0046620"
[133] "GO:0031297" "GO:0002362" "GO:1904978" "GO:0038183" "GO:0007140" "GO:1904544" "GO:0030851" "GO:0060369" "GO:0034440" "GO:0008037" "GO:1905828"
[144] "GO:1990428" "GO:0051598" "GO:2000116" "GO:1902631" "GO:0038123" "GO:0021904" "GO:0070242" "GO:1903596" "GO:0000165" "GO:0001787" "GO:1990267"
[155] "GO:0071347" "GO:0034625" "GO:0070563" "GO:1901222" "GO:0010452" "GO:0071931" "GO:0035905" "GO:0014059" "GO:0030178" "GO:0016255" "GO:0038092"
[166] "GO:0003310" "GO:0002508" "GO:0060430" "GO:0007019" "GO:1900825" "GO:0072198" "GO:0034138" "GO:0032510" "GO:1904398" "GO:0035630" "GO:0045823"
[177] "GO:0097503" "GO:0044598" "GO:0106072" "GO:0014896" "GO:0060587" "GO:0006044" "GO:1902691" "GO:0060216" "GO:0086014" "GO:0000045" "GO:0042414"
[188] "GO:0031666" "GO:0010070" "GO:0045191" "GO:0072201" "GO:0019673" "GO:0051296" "GO:1903069" "GO:0016098" "GO:2000007" "GO:0061566" "GO:0016191"
[199] "GO:0090179" "GO:0098719" "GO:0045967" "GO:0090370" "GO:0046628" "GO:0099641" "GO:1903576" "GO:0044546" "GO:0010643" "GO:2000777" "GO:0099551"
[210] "GO:0007418" "GO:0006264" "GO:1903645" "GO:0086091" "GO:0030166" "GO:0002237" "GO:0038168" "GO:0034765" "GO:0043570" "GO:0009116" "GO:0051497"
[221] "GO:0034760" "GO:0035212" "GO:0090292" "GO:1902307" "GO:0045540" "GO:0002876" "GO:0099044" "GO:1901021" "GO:0060762" "GO:0046034" "GO:0006254"
[232] "GO:0003279" "GO:1902889" "GO:0090042" "GO:0021772" "GO:0060350" "GO:0007186" "GO:0036023" "GO:0032075" "GO:0002225" "GO:0060792" "GO:0097473"
[243] "GO:0034721" "GO:0097101" "GO:1905165" "GO:0045986" "GO:0050773" "GO:0060841" "GO:0060872" "GO:0140469" "GO:0033540" "GO:0071569" "GO:0071425"
[254] "GO:1900195" "GO:0034959" "GO:0072107" "GO:0001894" "GO:0035459" "GO:0002840" "GO:0032965" "GO:0060070" "GO:0070392" "GO:0046079" "GO:0099642"
[265] "GO:0060164" "GO:0099607" "GO:0009165" "GO:0071033" "GO:0002693" "GO:0090263" "GO:0042157" "GO:0033632" "GO:1901652" "GO:0140466" "GO:0002761"
[276] "GO:0060299" "GO:0018149" "GO:1990108" "GO:0072053" "GO:1902548" "GO:0090494" "GO:0016240" "GO:0021670" "GO:0006539" "GO:1905833" "GO:0019370"
[287] "GO:0099505" "GO:0070383" "GO:0034391" "GO:0098586" "GO:1901389" "GO:0015838" "GO:0090091" "GO:0062026" "GO:1990519" "GO:0043046" "GO:1905751"
[298] "GO:0070649" "GO:0044854" "GO:0007166" "GO:1900227" "GO:0060127" "GO:0002098" "GO:0048169" "GO:0060137" "GO:2000741" "GO:0031334" "GO:0060765"
[309] "GO:0035543" "GO:1904292" "GO:0030308" "GO:0007031" "GO:0033215" "GO:0016486" "GO:1902607" "GO:0034392" "GO:0043217" "GO:0001816" "GO:1901536"
[320] "GO:0044827" "GO:0021568" "GO:0034773" "GO:0009257" "GO:0046786" "GO:1902803" "GO:0033561" "GO:2000979" "GO:0030521" "GO:0046086" "GO:0051660"
[331] "GO:0032048" "GO:0046890" "GO:0090340" "GO:0071298" "GO:0061772" "GO:1903217" "GO:0071622" "GO:0061909" "GO:0046203" "GO:0043052" "GO:0001570"
[342] "GO:1901475" "GO:0015782" "GO:0007028" "GO:0031017" "GO:0038113" "GO:0090721" "GO:0090038" "GO:0019395" "GO:0034627" "GO:1902883" "GO:0048703"
[353] "GO:1902669" "GO:2000288" "GO:0048611" "GO:1990535" "GO:0021756" "GO:0098921" "GO:1990910" "GO:0019511" "GO:1901340" "GO:1903489" "GO:0046425"
[364] "GO:0032958" "GO:0031103" "GO:0007216" "GO:0060516" "GO:0009072" "GO:0034397" "GO:1900095" "GO:2000791" "GO:0072125" "GO:0021610" "GO:0034214"
[375] "GO:0007040" "GO:0071338" "GO:0003366" "GO:0071897" "GO:0097114" "GO:0042133" "GO:0042490" "GO:0042543" "GO:0009104" "GO:0010085" "GO:0010920"
[386] "GO:1902630" "GO:0061402" "GO:1902203" "GO:0030488" "GO:0036049" "GO:2000167" "GO:0021812" "GO:0051086" "GO:1903142" "GO:0048672" "GO:0045039"
[397] "GO:0051284" "GO:0021612" "GO:1901726" "GO:0046464" "GO:0070507" "GO:0006225" "GO:0042494" "GO:0001525" "GO:0072207" "GO:0007062" "GO:0051164"
[408] "GO:1901522" "GO:0001755" "GO:0046470" "GO:0015961" "GO:0006127" "GO:0090119" "GO:0008361" "GO:0043569" "GO:0097476" "GO:0019432" "GO:0097403"
[419] "GO:0038173" "GO:0014051" "GO:0010838" "GO:0051461" "GO:1902938" "GO:0045661" "GO:0070309" "GO:0051102" "GO:0006942" "GO:1990442" "GO:0140058"
[430] "GO:0070830" "GO:0045113" "GO:0002357" "GO:1901797" "GO:0007072" "GO:1904980" "GO:0050975" "GO:0016052" "GO:0072003" "GO:0035239" "GO:0072538"
[441] "GO:0001844" "GO:0048745" "GO:0035360" "GO:1901565" "GO:0048630" "GO:0030513" "GO:0043368" "GO:0015829" "GO:0061419" "GO:0045947" "GO:1902960"
[452] "GO:0015729" "GO:0048738" "GO:0061565" "GO:0002184" "GO:0070668" "GO:0071641" "GO:0043254" "GO:0070778" "GO:0060319" "GO:0045686" "GO:0032072"
[463] "GO:0033631" "GO:0097106" "GO:0015728" "GO:0034276" "GO:0098506" "GO:0006172" "GO:0110077" "GO:0010592" "GO:1905279" "GO:0072356" "GO:0002456"
[474] "GO:0090402" "GO:1904261" "GO:0060575" "GO:0030449" "GO:0014916" "GO:0009263" "GO:0032740" "GO:0050974" "GO:0042752" "GO:0014010" "GO:0106217"
[485] "GO:0034140" "GO:1990823" "GO:0001560" "GO:0098990" "GO:1900758" "GO:0098849" "GO:0072675" "GO:2001013" "GO:1990858" "GO:0060158" "GO:0051904"
[496] "GO:0043928" "GO:0021888" "GO:0098813" "GO:0098968" "GO:0071409"

eggrandio avatar Apr 11 '22 13:04 eggrandio