queryTxObject adds "chr" to seqlevels, creating a mismatch with BSgenome seqlevels
I am using crisprDesign with a custom genome (Nicotiana tabacum K326) that has 24 chromosones and several scaffolds:
library(BSgenome.NtabacumK326.sierro2024)
K326_bsgenome <- BSgenome.NtabacumK326.sierro2024
> seqlevels(K326_bsgenome)
[1] "Un00001" "Un00002" "Un00003" "Un00004" "Un00006" "Un00007" "Un00008" "Un00011" "Un00016" "Un00020" "Un00022" "Un00025" "Un00026" "Un00027" "Un00029" "Un00031" "Un00035" "Un00039" "Un00041"
[20] "Un00046" "Un00048" "Un00050" "Un00051" "Un00061" "Un00065" "Un00073" "Un00074" "Un00078" "Un00080" "Un00081" "Un00082" "Un00083" "Un00084" "Un00085" "Un00088" "Un00089" "Un00092" "Un00095"
[39] "Un00098" "Un00100" "Un00105" "Un00108" "Un00118" "Un00120" "Un00123" "Un00125" "Un00128" "Un00136" "Un00139" "Un00143" "Un00144" "Un00148" "Un00150" "Un00162" "Un00163" "Un00171" "Un00172"
[58] "Un00173" "Un00175" "Un00180" "Un00194" "Un00198" "Un00199" "Un00200" "Un00205" "Un00209" "Un00212" "Un00213" "Un00214" "Un00216" "Un00220" "Un00229" "Un00230" "Un00245" "Un00249" "Un00253"
[77] "Un00258" "Un00264" "Un00266" "Un00271" "Un00273" "Un00278" "Un00280" "Un00281" "Un00288" "Un00289" "Un00292" "Un00294" "Un00298" "Un00301" "Un00305" "Un00307" "Un00311" "Un00315" "Un00316"
[96] "Un00317" "Un00318" "Un00320" "Un00321" "Un00333" "Un00334" "Un00337" "Un00339" "Un00341" "Un00342" "Un00344" "Un00346" "Un00353" "Un00361" "Un00375" "Un00380" "Un00384" "Un00385" "Un00389"
[115] "Un00390" "Un00392" "Un00393" "Un00395" "Un00400" "Un00404" "Un00405" "Un00409" "Un00410" "Un00414" "Un00415" "Un00418" "Un00420" "Un00422" "Un00427" "Un00428" "Un00429" "Un00430" "Un00433"
[134] "Un00443" "Un00447" "Un00450" "Un00453" "Un00470" "Un00474" "Un00475" "Un00476" "Un00489" "Un00490" "Un00491" "Un00495" "Un00498" "Un00499" "Un00502" "Un00504" "Un00507" "Un00510" "Un00518"
[153] "Un00521" "Un00522" "Un00528" "Un00562" "Un00564" "Un00566" "Un00567" "Un00575" "Un00582" "Un00587" "Un00594" "Un00598" "Un00601" "Un00628" "Un00634" "Un00646" "Un00661" "Un00670" "Chr01"
[172] "Chr02" "Chr03" "Chr04" "Chr05" "Chr06" "Chr07" "Chr08" "Chr09" "Chr10" "Chr11" "Chr12" "Chr13" "Chr14" "Chr15" "Chr16" "Chr17" "Chr18" "Chr19" "Chr20"
[191] "Chr21" "Chr22" "Chr23" "Chr24"
I make a TxDb object from the .gtf file, and it has matching seqlevels:
K326_txdb <- makeTxDbFromGFF("./ntab.gtf")
> seqlevels(K326_txdb)
[1] "Un00001" "Un00002" "Un00003" "Un00004" "Un00006" "Un00007" "Un00008" "Un00011" "Un00016" "Un00020" "Un00022" "Un00025" "Un00026" "Un00027" "Un00029" "Un00031" "Un00035" "Un00039" "Un00041"
[20] "Un00046" "Un00048" "Un00050" "Un00051" "Un00061" "Un00065" "Un00073" "Un00074" "Un00078" "Un00080" "Un00081" "Un00082" "Un00083" "Un00084" "Un00085" "Un00088" "Un00089" "Un00092" "Un00095"
[39] "Un00098" "Un00100" "Un00105" "Un00108" "Un00118" "Un00120" "Un00123" "Un00125" "Un00128" "Un00136" "Un00139" "Un00143" "Un00144" "Un00148" "Un00150" "Un00162" "Un00163" "Un00171" "Un00172"
[58] "Un00173" "Un00175" "Un00180" "Un00194" "Un00198" "Un00199" "Un00200" "Un00205" "Un00209" "Un00212" "Un00213" "Un00214" "Un00216" "Un00220" "Un00229" "Un00230" "Un00245" "Un00249" "Un00253"
[77] "Un00258" "Un00264" "Un00266" "Un00271" "Un00273" "Un00278" "Un00280" "Un00281" "Un00288" "Un00289" "Un00292" "Un00294" "Un00298" "Un00301" "Un00305" "Un00307" "Un00311" "Un00315" "Un00316"
[96] "Un00317" "Un00318" "Un00320" "Un00321" "Un00333" "Un00334" "Un00337" "Un00339" "Un00341" "Un00342" "Un00344" "Un00346" "Un00353" "Un00361" "Un00375" "Un00380" "Un00384" "Un00385" "Un00389"
[115] "Un00390" "Un00392" "Un00393" "Un00395" "Un00400" "Un00404" "Un00405" "Un00409" "Un00410" "Un00414" "Un00415" "Un00418" "Un00420" "Un00422" "Un00427" "Un00428" "Un00429" "Un00430" "Un00433"
[134] "Un00443" "Un00447" "Un00450" "Un00453" "Un00470" "Un00474" "Un00475" "Un00476" "Un00489" "Un00490" "Un00491" "Un00495" "Un00498" "Un00499" "Un00502" "Un00504" "Un00507" "Un00510" "Un00518"
[153] "Un00521" "Un00522" "Un00528" "Un00562" "Un00564" "Un00566" "Un00567" "Un00575" "Un00582" "Un00587" "Un00594" "Un00598" "Un00601" "Un00628" "Un00634" "Un00646" "Un00661" "Un00670" "Chr01"
[172] "Chr02" "Chr03" "Chr04" "Chr05" "Chr06" "Chr07" "Chr08" "Chr09" "Chr10" "Chr11" "Chr12" "Chr13" "Chr14" "Chr15" "Chr16" "Chr17" "Chr18" "Chr19" "Chr20"
[191] "Chr21" "Chr22" "Chr23" "Chr24"
However, when I query the txdb object with queryTxObject, I get a "new" seqlevels as it adds a preceding "chr". I understand that this is the standard way of annotating chromosomes, but in this case, if I had chr00-chr24, it would still add the "chr" to the scaffolds.
gr <- queryTxObject(txObject = K326_txdb,
featureType = "cds",
queryColumn = "gene_id",
queryValue = "Ntab00g003960.1")
> gr
GRanges object with 1 range and 14 metadata columns:
seqnames ranges strand | tx_id gene_id protein_id tx_type gene_symbol exon_id exon_rank cds_start cds_end tx_start tx_end cds_len exon_start
<Rle> <IRanges> <Rle> | <character> <character> <character> <character> <logical> <character> <integer> <integer> <integer> <integer> <integer> <integer> <integer>
region_1 chrUn00245 43414-44511 - | Ntab00g003960-1.1 Ntab00g003960.1 <NA> transcript <NA> <NA> 1 <NA> <NA> 42071 44555 1098 43371
exon_end
<integer>
region_1 44555
-------
seqinfo: 194 sequences from an unspecified genome; no seqlengths
> seqlevels(gr)
[1] "chrUn00001" "chrUn00002" "chrUn00003" "chrUn00004" "chrUn00006" "chrUn00007" "chrUn00008" "chrUn00011" "chrUn00016" "chrUn00020" "chrUn00022" "chrUn00025" "chrUn00026" "chrUn00027" "chrUn00029"
[16] "chrUn00031" "chrUn00035" "chrUn00039" "chrUn00041" "chrUn00046" "chrUn00048" "chrUn00050" "chrUn00051" "chrUn00061" "chrUn00065" "chrUn00073" "chrUn00074" "chrUn00078" "chrUn00080" "chrUn00081"
[31] "chrUn00082" "chrUn00083" "chrUn00084" "chrUn00085" "chrUn00088" "chrUn00089" "chrUn00092" "chrUn00095" "chrUn00098" "chrUn00100" "chrUn00105" "chrUn00108" "chrUn00118" "chrUn00120" "chrUn00123"
[46] "chrUn00125" "chrUn00128" "chrUn00136" "chrUn00139" "chrUn00143" "chrUn00144" "chrUn00148" "chrUn00150" "chrUn00162" "chrUn00163" "chrUn00171" "chrUn00172" "chrUn00173" "chrUn00175" "chrUn00180"
[61] "chrUn00194" "chrUn00198" "chrUn00199" "chrUn00200" "chrUn00205" "chrUn00209" "chrUn00212" "chrUn00213" "chrUn00214" "chrUn00216" "chrUn00220" "chrUn00229" "chrUn00230" "chrUn00245" "chrUn00249"
[76] "chrUn00253" "chrUn00258" "chrUn00264" "chrUn00266" "chrUn00271" "chrUn00273" "chrUn00278" "chrUn00280" "chrUn00281" "chrUn00288" "chrUn00289" "chrUn00292" "chrUn00294" "chrUn00298" "chrUn00301"
[91] "chrUn00305" "chrUn00307" "chrUn00311" "chrUn00315" "chrUn00316" "chrUn00317" "chrUn00318" "chrUn00320" "chrUn00321" "chrUn00333" "chrUn00334" "chrUn00337" "chrUn00339" "chrUn00341" "chrUn00342"
[106] "chrUn00344" "chrUn00346" "chrUn00353" "chrUn00361" "chrUn00375" "chrUn00380" "chrUn00384" "chrUn00385" "chrUn00389" "chrUn00390" "chrUn00392" "chrUn00393" "chrUn00395" "chrUn00400" "chrUn00404"
[121] "chrUn00405" "chrUn00409" "chrUn00410" "chrUn00414" "chrUn00415" "chrUn00418" "chrUn00420" "chrUn00422" "chrUn00427" "chrUn00428" "chrUn00429" "chrUn00430" "chrUn00433" "chrUn00443" "chrUn00447"
[136] "chrUn00450" "chrUn00453" "chrUn00470" "chrUn00474" "chrUn00475" "chrUn00476" "chrUn00489" "chrUn00490" "chrUn00491" "chrUn00495" "chrUn00498" "chrUn00499" "chrUn00502" "chrUn00504" "chrUn00507"
[151] "chrUn00510" "chrUn00518" "chrUn00521" "chrUn00522" "chrUn00528" "chrUn00562" "chrUn00564" "chrUn00566" "chrUn00567" "chrUn00575" "chrUn00582" "chrUn00587" "chrUn00594" "chrUn00598" "chrUn00601"
[166] "chrUn00628" "chrUn00634" "chrUn00646" "chrUn00661" "chrUn00670" "chrChr01" "chrChr02" "chrChr03" "chrChr04" "chrChr05" "chrChr06" "chrChr07" "chrChr08" "chrChr09" "chrChr10"
[181] "chrChr11" "chrChr12" "chrChr13" "chrChr14" "chrChr15" "chrChr16" "chrChr17" "chrChr18" "chrChr19" "chrChr20" "chrChr21" "chrChr22" "chrChr23" "chrChr24"
I can fix it by renaming the seqlevels, but I think that there should be an option to avoid adding the "chr" when using non-standard genomes, or assemblies with scaffolds.
> seqlevels(gr) <- sub("^chr", "", seqlevels(gr))
> gr
GRanges object with 1 range and 14 metadata columns:
seqnames ranges strand | tx_id gene_id protein_id tx_type gene_symbol exon_id exon_rank cds_start cds_end tx_start tx_end cds_len exon_start
<Rle> <IRanges> <Rle> | <character> <character> <character> <character> <logical> <character> <integer> <integer> <integer> <integer> <integer> <integer> <integer>
region_1 Un00245 43414-44511 - | Ntab00g003960-1.1 Ntab00g003960.1 <NA> transcript <NA> <NA> 1 <NA> <NA> 42071 44555 1098 43371
exon_end
<integer>
region_1 44555
-------
seqinfo: 194 sequences from an unspecified genome; no seqlengths
> seqlevels(gr)
[1] "Un00001" "Un00002" "Un00003" "Un00004" "Un00006" "Un00007" "Un00008" "Un00011" "Un00016" "Un00020" "Un00022" "Un00025" "Un00026" "Un00027" "Un00029" "Un00031" "Un00035" "Un00039" "Un00041"
[20] "Un00046" "Un00048" "Un00050" "Un00051" "Un00061" "Un00065" "Un00073" "Un00074" "Un00078" "Un00080" "Un00081" "Un00082" "Un00083" "Un00084" "Un00085" "Un00088" "Un00089" "Un00092" "Un00095"
[39] "Un00098" "Un00100" "Un00105" "Un00108" "Un00118" "Un00120" "Un00123" "Un00125" "Un00128" "Un00136" "Un00139" "Un00143" "Un00144" "Un00148" "Un00150" "Un00162" "Un00163" "Un00171" "Un00172"
[58] "Un00173" "Un00175" "Un00180" "Un00194" "Un00198" "Un00199" "Un00200" "Un00205" "Un00209" "Un00212" "Un00213" "Un00214" "Un00216" "Un00220" "Un00229" "Un00230" "Un00245" "Un00249" "Un00253"
[77] "Un00258" "Un00264" "Un00266" "Un00271" "Un00273" "Un00278" "Un00280" "Un00281" "Un00288" "Un00289" "Un00292" "Un00294" "Un00298" "Un00301" "Un00305" "Un00307" "Un00311" "Un00315" "Un00316"
[96] "Un00317" "Un00318" "Un00320" "Un00321" "Un00333" "Un00334" "Un00337" "Un00339" "Un00341" "Un00342" "Un00344" "Un00346" "Un00353" "Un00361" "Un00375" "Un00380" "Un00384" "Un00385" "Un00389"
[115] "Un00390" "Un00392" "Un00393" "Un00395" "Un00400" "Un00404" "Un00405" "Un00409" "Un00410" "Un00414" "Un00415" "Un00418" "Un00420" "Un00422" "Un00427" "Un00428" "Un00429" "Un00430" "Un00433"
[134] "Un00443" "Un00447" "Un00450" "Un00453" "Un00470" "Un00474" "Un00475" "Un00476" "Un00489" "Un00490" "Un00491" "Un00495" "Un00498" "Un00499" "Un00502" "Un00504" "Un00507" "Un00510" "Un00518"
[153] "Un00521" "Un00522" "Un00528" "Un00562" "Un00564" "Un00566" "Un00567" "Un00575" "Un00582" "Un00587" "Un00594" "Un00598" "Un00601" "Un00628" "Un00634" "Un00646" "Un00661" "Un00670" "Chr01"
[172] "Chr02" "Chr03" "Chr04" "Chr05" "Chr06" "Chr07" "Chr08" "Chr09" "Chr10" "Chr11" "Chr12" "Chr13" "Chr14" "Chr15" "Chr16" "Chr17" "Chr18" "Chr19" "Chr20"
[191] "Chr21" "Chr22" "Chr23" "Chr24"
Hi @eggrandio,
Indeed, that shouldn't happen.
The best way to avoid that is to convert the txdb object into a txObject using the function crisprDesign::TxDb2GRangesList
with the argument seqlevelsStyle="custom", and use that object for all downstream analyses.
I've added the argument seqlevelsStyle="custom" in the 65ee39d branch. The seqlevels will remain unchanged.
Let me know if that works for you
Hello,
I think that using custom seqlevelStyles also raises a problem in addGeneAnnotation().
Modifying the seqlevels of the gr object:
K326_txdb <- txdbmaker::makeTxDbFromGFF("./ntab_fixed.gtf")
K326_txdb %>% seqlevelsStyle()
[1] "ASM9120v1"
gr <- queryTxObject(txObject = K326_txdb,
featureType = "cds",
queryColumn = "gene_id",
queryValue = "Ntab00g003960.1")
seqlevels(gr) <- sub("^chr", "", seqlevels(gr))
# find guides ####
bowtie_index <- "./ntab_bowtie_index/K326"
guideSet <- findSpacers(gr,
bsgenome = K326_bsgenome,
crisprNuclease = AsCas12a) %>%
addSequenceFeatures() %>%
addRestrictionEnzymes(enzymeNames = c("BsaI", "BsmBI"),
includeDefault = FALSE,
flanking5 = "agat",
flanking3 = "taat") %>%
mutate(enzymeAnnotation_flag = sapply(enzymeAnnotation, function(x) any(unlist(x)))) %>%
filter(!polyA & !polyC & !polyG & !polyT & !startingGGGGG & !enzymeAnnotation_flag)
guideSet <- addSpacerAlignments(guideSet,
aligner = "bowtie",
aligner_index = bowtie_index,
bsgenome = K326_bsgenome,
n_mismatches = 3,
txObject = K326_txdb)
# THIS IS WHERE IT FAILS ####
guideSet_test <- addGeneAnnotation(guideSet,
txObject = K326_txdb)
Error in .replace_seqlevels_style(x_seqlevels, value) :
found no sequence renaming map compatible with seqname style "ASM9120v1" for this object
If I use the txdb object as GRanges using the "custom" seqlevelsStyle , it also fails:
K326_txdb2 <- crisprDesign::TxDb2GRangesList(K326_txdb,
seqlevelsStyle = "custom")
K326_txdb2 %>% seqlevelsStyle()
[1] "ASM9120v1"
guideSet <- findSpacers(gr,
bsgenome = K326_bsgenome,
crisprNuclease = AsCas12a) %>%
addSequenceFeatures() %>%
addRestrictionEnzymes(enzymeNames = c("BsaI", "BsmBI"),
includeDefault = FALSE,
flanking5 = "agat",
flanking3 = "taat") %>%
mutate(enzymeAnnotation_flag = sapply(enzymeAnnotation, function(x) any(unlist(x)))) %>%
filter(!polyA & !polyC & !polyG & !polyT & !startingGGGGG & !enzymeAnnotation_flag)
guideSet <- addSpacerAlignments(guideSet,
aligner = "bowtie",
aligner_index = bowtie_index,
bsgenome = K326_bsgenome,
n_mismatches = 3,
txObject = K326_txdb2)
guideSet_test <- addGeneAnnotation(guideSet,
txObject = K326_txdb2)
Error in .new_IRanges_from_start_end(start, end) :
'start' or 'end' cannot contain NAs
@eggrandio Could you share with me your K326_txdb object?
Hello,
I was able to obtain the overlapping offtarget genes with a custom script, but I guess that more people that want to use their custom genome/annotations might find the same issue.
Here is the K326_txdb object as rds: K326_txdb.rds Fore reference, I generated it from this .gtf file: ntab_fixed.gtf Which, in turn is a properly formatted .gtf file version of the .gff from this publication: K326 genome and .gff https://doi.org/10.5281/zenodo.14816438 Sierro, N., Auberson, M., Dulize, R. et al. Chromosome-level genome assemblies of Nicotiana tabacum, Nicotiana sylvestris, and Nicotiana tomentosiformis. Sci Data 11, 135 (2024). https://doi.org/10.1038/s41597-024-02965-2
@eggrandio Did you try converting first your txdb object into a granges list before feeding it into the downstream functions? Something like
txdb <- crisprDesign::TxDb2GRangesList(K326_txdb, seqlevelsStyle = "custom")
txdb <- TxDb2GRangesList(txdb)
Yes. In that case I got this error when I tried to add gene annotations:
Error in .new_IRanges_from_start_end(start, end) :
'start' or 'end' cannot contain NAs
I had to upgrade R to use the "custom" selevelsstyle so then I had to use the txdbmaker version of maketxdbfromgff (I think the new version uses Seqinfo).
Can you use the file I uploaded without issues ?
I haven't had the time yet; did upgrading R and to the devel branch of Bioc resolve everything?
Nope. I was mentioning it because I had to upgrade to use the customs seqlevels. But it did not fix the issue, it just gave a different error.
@eggrandio Can you share with me your custom BSgenome package?
@eggrandio and bowtie index files