TCGAbiolinks
TCGAbiolinks copied to clipboard
How to get CNV status and cytoband information
Before GDC made the recent changes, I was able to get GISTIC scores with values such as -1, 0 and +1 using Gene Level Copy Number Scores
. Now with the ASCAT copy number values, I am not sure how to get information on gain, loss, deletion and amplification. The new data also does not have cytoband info like before. Any ideas on how to get 1) cnv status
and 2) cytoband
information?
# absolute copy number scores
query <- GDCquery(project = "TCGA-GBM",
data.category = "Copy Number Variation",
data.type = "Gene Level Copy Number",
barcode = c("TCGA-14-2555-01B-01D-0911-01", "TCGA-32-4213-01A-01D-1224-01"))
GDCdownload(query)
cnv_data <- GDCprepare(query, summarizedExperiment = F)
# only select copy_number columns
cnv_data <- cnv_data %>%
dplyr::select(-c(grep('_min_copy_number|_max_copy_number', colnames(cnv_data))))
colnames(cnv_data) <- gsub(",.*", "", colnames(cnv_data))
head(cnv_data)
# A tibble: 6 × 7
gene_id gene_name chromosome start end `TCGA-14-2555-01B-01D-0911-01` `TCGA-32-4213-01A-01D-1224-01`
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 ENSG00000223972.5 DDX11L1 chr1 11869 14409 NA NA
2 ENSG00000227232.5 WASH7P chr1 14404 29570 NA NA
3 ENSG00000278267.1 MIR6859-1 chr1 17369 17436 NA NA
4 ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 NA NA
5 ENSG00000284332.1 MIR1302-2 chr1 30366 30503 NA NA
6 ENSG00000237613.2 FAM138A chr1 34554 36081 NA NA
# values are from 0 to 36 for 1 sample and 0-29 for the second sample
# how to interpret gain/loss from this info?
> summary(cnv_data[ c("TCGA-14-2555-01B-01D-0911-01", "TCGA-32-4213-01A-01D-1224-01")])
TCGA-14-2555-01B-01D-0911-01 TCGA-32-4213-01A-01D-1224-01
Min. : 0.000 Min. : 0.000
1st Qu.: 2.000 1st Qu.: 2.000
Median : 2.000 Median : 2.000
Mean : 2.078 Mean : 1.936
3rd Qu.: 2.000 3rd Qu.: 2.000
Max. :36.000 Max. :29.000
NA's :350 NA's :352
Any help would be much appreciated. Thanks!!
ASCAT copy number values are also new for me.
You should be able to get cytoband information with biomaRt
library(biomaRt)
ensembl <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
attributes <- c(
"chromosome_name",
"start_position",
"end_position",
"strand",
"ensembl_gene_id",
"entrezgene_id",
"external_gene_name",
"band"
)
gene.info <- getBM(
attributes = attributes,
mart = ensembl
)
gene.info <- gene.info[gene.info$band != "",]
Thanks @tiagochst - do you mind if I keep this ticket open just in case you or someone else can figure it out in the near future?
@komalsrathi Sure!
@komalsrathi Did you try posting on Biostar? I recall reading this one here: https://www.biostars.org/p/494728/, but I still need to read the article about the new algorithms.
@tiagochst I have not, but that is a good idea. What I have done right now is 1) get total copy number like above 2) get ploidy from clinical data and used this reference to get status: https://cancer.sanger.ac.uk/cosmic/help/cnv/overview. I am not sure if this is the correct way to do it so maybe I'll post my strategy on Biostars for feedback.
suppressPackageStartupMessages({
library(TCGAbiolinks)
library(tidyverse)
library(biomaRt)
library(SummarizedExperiment)
})
# get copy number from GDC (this example is using TCGA-GBM but this will be generalized within the reporting code)
query <- GDCquery(project = "TCGA-GBM",
data.category = "Copy Number Variation",
data.type = "Gene Level Copy Number")
GDCdownload(query)
# get ploidy info from clinical data
cnv_data <- GDCprepare(query, summarizedExperiment = T)
clin_data <- SummarizedExperiment::colData(cnv_data) %>%
as.data.frame() %>%
mutate(sample_id = paste0(sample_submitter_id, "-", sample_type_id),
barcode = gsub(",.*", "", barcode)) %>%
dplyr::select(barcode, sample_id, paper_ABSOLUTE.ploidy) %>%
filter(!is.na(paper_ABSOLUTE.ploidy))
# get absolute copy number scores (ASCAT)
cnv_data <- GDCprepare(query, summarizedExperiment = F)
# there are values with min_copy_number and max_copy_number but I am not using them
cnv_data <- cnv_data %>%
dplyr::select(-c(grep('_min_copy_number|_max_copy_number', colnames(cnv_data))))
# format column names to only keep barcode
colnames(cnv_data) <- gsub(",.*", "", colnames(cnv_data))
# convert to long format
cnv_data <- cnv_data %>%
dplyr::select(-c(chromosome, start, end)) %>%
dplyr::rename("hgnc_symbol" = "gene_name",
"ensembl" = "gene_id") %>%
gather('barcode', 'copy_number', -c("hgnc_symbol", "ensembl")) %>%
filter(!is.na(copy_number))
cnv_data <- cnv_data %>%
inner_join(clin_data, by = "barcode")
# map status using absolute copy number and ploidy
# taken from https://cancer.sanger.ac.uk/cosmic/help/cnv/overview
cnv_data <- cnv_data %>%
dplyr::rename("ploidy" = "paper_ABSOLUTE.ploidy") %>%
mutate(status = ifelse(test = (ploidy <= 2.7 & copy_number >= 5) | (ploidy > 2.7 & copy_number >= 9),
yes = "Gain",
no = ifelse(test = (ploidy <= 2.7 & copy_number == 0) | (ploidy > 2.7 & copy_number < (ploidy-2.7)),
yes = "Loss",
no = "Neutral")))
# map cytoband information to gene symbols
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
my_regions <- getBM(c("hgnc_symbol", "band"),
filters = c("hgnc_symbol"),
values = list(hgnc_symbol = unique(cnv_data$hgnc_symbol)),
mart = ensembl)
my_regions <- my_regions %>%
filter(band != "") %>%
dplyr::rename("cytoband" = "band")
cnv_data <- cnv_data %>%
left_join(my_regions, by = "hgnc_symbol")
@tiagochst I have not, but that is a good idea. What I have done right now is 1) get total copy number like above 2) get ploidy from clinical data and used this reference to get status: https://cancer.sanger.ac.uk/cosmic/help/cnv/overview. I am not sure if this is the correct way to do it so maybe I'll post my strategy on Biostars for feedback.
suppressPackageStartupMessages({ library(TCGAbiolinks) library(tidyverse) library(biomaRt) library(SummarizedExperiment) }) # get copy number from GDC (this example is using TCGA-GBM but this will be generalized within the reporting code) query <- GDCquery(project = "TCGA-GBM", data.category = "Copy Number Variation", data.type = "Gene Level Copy Number") GDCdownload(query) # get ploidy info from clinical data cnv_data <- GDCprepare(query, summarizedExperiment = T) clin_data <- SummarizedExperiment::colData(cnv_data) %>% as.data.frame() %>% mutate(sample_id = paste0(sample_submitter_id, "-", sample_type_id), barcode = gsub(",.*", "", barcode)) %>% dplyr::select(barcode, sample_id, paper_ABSOLUTE.ploidy) %>% filter(!is.na(paper_ABSOLUTE.ploidy)) # get absolute copy number scores (ASCAT) cnv_data <- GDCprepare(query, summarizedExperiment = F) # there are values with min_copy_number and max_copy_number but I am not using them cnv_data <- cnv_data %>% dplyr::select(-c(grep('_min_copy_number|_max_copy_number', colnames(cnv_data)))) # format column names to only keep barcode colnames(cnv_data) <- gsub(",.*", "", colnames(cnv_data)) # convert to long format cnv_data <- cnv_data %>% dplyr::select(-c(chromosome, start, end)) %>% dplyr::rename("hgnc_symbol" = "gene_name", "ensembl" = "gene_id") %>% gather('barcode', 'copy_number', -c("hgnc_symbol", "ensembl")) %>% filter(!is.na(copy_number)) cnv_data <- cnv_data %>% inner_join(clin_data, by = "barcode") # map status using absolute copy number and ploidy # taken from https://cancer.sanger.ac.uk/cosmic/help/cnv/overview cnv_data <- cnv_data %>% dplyr::rename("ploidy" = "paper_ABSOLUTE.ploidy") %>% mutate(status = ifelse(test = (ploidy <= 2.7 & copy_number >= 5) | (ploidy > 2.7 & copy_number >= 9), yes = "Gain", no = ifelse(test = (ploidy <= 2.7 & copy_number == 0) | (ploidy > 2.7 & copy_number < (ploidy-2.7)), yes = "Loss", no = "Neutral"))) # map cytoband information to gene symbols ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") my_regions <- getBM(c("hgnc_symbol", "band"), filters = c("hgnc_symbol"), values = list(hgnc_symbol = unique(cnv_data$hgnc_symbol)), mart = ensembl) my_regions <- my_regions %>% filter(band != "") %>% dplyr::rename("cytoband" = "band") cnv_data <- cnv_data %>% left_join(my_regions, by = "hgnc_symbol")
Thank you very much for your answer,but I want to konw how did you handle it the duplicated samples duplicated?such as download the PAAD Copy Number Variation,I get the following error:
cnv_data <- GDCprepare(query, summarizedExperiment = T) | |cases |experimental_strategy |analysis_workflow_type | |:--|:----------------------------|:---------------------|:----------------------| |25 |TCGA-HZ-A9TJ-10A-01D-A40V-01 |Genotyping Array |ASCAT2 | |26 |TCGA-HZ-A9TJ-10A-01D-A40V-01 |Genotyping Array |ASCAT2 | Error in GDCprepare(query, summarizedExperiment = T) : There are samples duplicated. We will not be able to prepare it
Any help would be greatly appreciated!
@xiaolan552
The following code is working for me. Probably you need to update TCGAbiolinks with BiocManager::install("BioinformaticsFMRP/TCGAbiolinks")
query <- GDCquery(
project = "TCGA-PAAD",
data.category = "Copy Number Variation",
data.type = "Gene Level Copy Number"
)
GDCdownload(query)
data <- GDCprepare(query)
The two samples are below.
grep("TCGA-HZ-A9TJ-",query$results[[1]]$cases,value = T) [1] "TCGA-HZ-A9TJ-06A-11D-A40V-01,TCGA-HZ-A9TJ-10A-01D-A40V-01" [2] "TCGA-HZ-A9TJ-01A-11D-A40V-01,TCGA-HZ-A9TJ-10A-01D-A40V-01"
@tiagochst Thank you very much!! I'll try it right away。