dandelion
dandelion copied to clipboard
non-10x input (just fasta) e.g. immunoseq
Description of the issue
Feature request by @[email protected]
I would like to check if we could run the data output from the immunoseq ? I have a BCR dataset that I would like to analyse via your package and output different than 10x. I wonder if you have any script to import the data analysis via your package pipeline.
Implementation will be something like:
import os
import pandas as pd
import shutil
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional
from dandelion.utilities._io import fasta_iterator, Write_output
def prepare_non10x_fasta(
fasta: str,
outdir: Optional[str] = None,
):
"""
Prepare a non-10x fasta so that it can be ingested like for downstream analysis.
Parameters
----------
fasta : str
path to fasta file.
outdir : Optional[str], optional
path to output location. `None` defaults to 'dandelion'.
"""
fh = open(fasta, "r")
seqs = {}
for header, sequence in fasta_iterator(fh):
seqs[header] = sequence
fh.close()
basedir = os.path.dirname(fasta)
if outdir is None:
outdir = basedir.rstrip("/") + "/" + Path(os.path.basename(fasta)).stem
if not outdir.endswith("/"):
outdir = outdir + "/"
if not os.path.exists(outdir):
os.makedirs(outdir)
out_fasta = outdir + "all_contig.fasta"
out_anno_path = outdir + "all_contig_annotations.csv"
fh1 = open(out_fasta, "w")
fh1.close()
out = ""
anno = []
for l in seqs:
out = ">" + l + "-1_contig-1" + "\n" + seqs[l] + "\n"
Write_output(out, out_fasta)
# also create a dummy contig_annotations.csv
defaultrow = {
"barcode": l,
"is_cell": "TRUE",
"contig_id": l + "-1_contig-1",
"high_confidence": "TRUE",
"length": str(len(seqs[l])),
"chain": "None",
"v_gene": "None",
"d_gene": "None",
"j_gene": "None",
"c_gene": "None",
"full_length": "TRUE",
"productive": "TRUE",
"cdr3": "None",
"cdr3_nt": "None",
"reads": 1,
"umis": 1,
"raw_clonotype_id": "None",
"raw_consensus_id": "None",
}
anno.append(defaultrow)
anno = pd.DataFrame(anno)
anno.to_csv(out_anno_path, index=False)
def prepare_non10x_fastas(
fastas: List[str],
outdir: Optional[str] = None,
):
"""
Prepare a non-10x fastas so that it can be ingested like for downstream analysis.
Parameters
----------
fastas : List[str]
list of paths to fasta files.
outdir : Optional[str], optional
path to out put location.
"""
if type(fastas) is not list:
fastas = [fastas]
for i in tqdm(
range(0, len(fastas)),
desc="Formating fasta(s) ",
bar_format="{l_bar}{bar:10}{r_bar}{bar:-10b}",
):
prepare_non10x_fasta(
fastas[i],
outdir=outdir,
)
usage would be:
import os
import dandelion as ddl
files = [
"/Users/kt16/Downloads/immunoseqtest/sample-1.fasta",
"/Users/kt16/Downloads/immunoseqtest/sample-2.fasta",
]
prepare_non10x_fastas(files)
# and then either just use the singularity image from here onwards
# singularity run sc-dandelion.sif dandelion-preprocess
# or just do it manually
os.chdir("/Users/kt16/Downloads/immunoseqtest/")
samples = ["sample-1", "sample-2"]
ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
ddl.pp.reannotate_genes(samples, filename_prefix="all") # etc
Thanks @zktuong .
Hello @zktuong
I have a issue abit.
evrything was fine until the prepare_non10x_fastas(files) then i get the below error.
import os
import dandelion as ddl
files = [
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_166_ACKR3-1.fasta",
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_166_ACKR3-2.fasta",
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_167_ACKR3-1.fasta",
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_167_ACKR3-2.fasta",
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_193_ACKR3-1.fasta",
"/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/BAFF-Tg_193_ACKR3-2.fasta",
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_190_ACKR3-1.fasta',
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_190_ACKR3-2.fasta',
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_197_ACKR3-1.fasta',
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_197_ACKR3-2.fasta',
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_199_ACKR3-1.fasta',
'/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/Wt_199_ACKR3-2.fasta',
]
prepare_non10x_fastas(files)
This is create the multiple folder which contain the all_contig.fasta and all_contig_annonation.csv
os.chdir("/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/")
samples = ["BAFF-Tg_166_ACKR3-1",
"BAFF-Tg_166_ACKR3-2",
"BAFF-Tg_167_ACKR3-1",
"BAFF-Tg_167_ACKR3-2",
"BAFF-Tg_193_ACKR3-1",
"BAFF-Tg_193_ACKR3-2",
'Wt_190_ACKR3-1',
'Wt_190_ACKR3-2',
'Wt_197_ACKR3-1',
'Wt_197_ACKR3-2',
'Wt_199_ACKR3-1',
'Wt_199_ACKR3-2'
]
ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
ddl.pp.reannotate_genes(samples, filename_prefix="all") # etc
Formating fasta(s) : 100%|██████████| 12/12 [00:26<00:00, 2.19s/it]
Formating fasta(s) : 17%|█▋ | 2/12 [00:05<00:28, 2.84s/it]
Output exceeds the [size limit](command:workbench.action.openSettings?[). Open the full output data [in a text editor](command:workbench.action.openLargeOutput?ea2eb95a-02d2-468f-9d09-913dc51f17b9)
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In [12], line 15
1 os.chdir("/Users/sei/Documents/data/BCR/Results_fasta_2022-10-18_06-27/")
2 samples = ["BAFF-Tg_166_ACKR3-1",
3 "BAFF-Tg_166_ACKR3-2",
4 "BAFF-Tg_167_ACKR3-1",
(...)
13 'Wt_199_ACKR3-2'
14 ]
---> 15 ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
16 ddl.pp.reannotate_genes(samples, filename_prefix="all")
File ~/opt/miniconda3/envs/SCVI/lib/python3.8/site-packages/dandelion/preprocessing/_preprocessing.py:415, in format_fastas(fastas, prefix, suffix, sep, remove_trailing_hyphen_number, high_confidence_filtering, outdir, filename_prefix)
396 format_fasta(
397 fastas[i],
398 prefix=prefix_dict[fastas[i]],
(...)
404 filename_prefix=filename_prefix[i],
405 )
406 else:
407 format_fasta(
408 fastas[i],
409 prefix=prefix_dict[fastas[i]],
410 suffix=None,
...
416 )
417 else:
418 if suffix is not None:
IndexError: list index out of range
If i run the single comment
samples = [
"BAFF-Tg_166_ACKR3-1",
# "BAFF-Tg_166_ACKR3-2",
# "BAFF-Tg_167_ACKR3-1",
# "BAFF-Tg_167_ACKR3-2",
# "BAFF-Tg_193_ACKR3-1",
# "BAFF-Tg_193_ACKR3-2",
# 'Wt_190_ACKR3-1',
# 'Wt_190_ACKR3-2',
# 'Wt_197_ACKR3-1',
# 'Wt_197_ACKR3-2',
# 'Wt_199_ACKR3-1',
# 'Wt_199_ACKR3-2',
]
ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
ddl.pp.reannotate_genes(samples, filename_prefix="all") # etc
I am having is error
Formating fasta(s) : 100%|██████████| 1/1 [00:03<00:00, 3.12s/it]
Assigning genes : 0%| | 0/1 [00:00<?, ?it/s]
Output exceeds the [size limit](command:workbench.action.openSettings?[). Open the full output data [in a text editor](command:workbench.action.openLargeOutput?bab481c0-9daf-4627-9cc2-ce8fba7378a8)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~/opt/miniconda3/envs/SCVI/lib/python3.8/site-packages/dandelion/preprocessing/_preprocessing.py:4303, in run_igblastn(fasta, igblast_db, org, loci, evalue, min_d_match, verbose)
4302 try:
-> 4303 igdb = env["IGDATA"]
4304 except KeyError:
KeyError: 'IGDATA'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
Cell In [14], line 16
2 samples = ["BAFF-Tg_166_ACKR3-1",
3 # "BAFF-Tg_166_ACKR3-2",
4 # "BAFF-Tg_167_ACKR3-1",
(...)
13 # 'Wt_199_ACKR3-2',
14 ]
15 ddl.pp.format_fastas(samples, prefix=samples, filename_prefix=["all", "all"])
---> 16 ddl.pp.reannotate_genes(samples, filename_prefix="all")
File ~/opt/miniconda3/envs/SCVI/lib/python3.8/site-packages/dandelion/preprocessing/_preprocessing.py:1040, in reannotate_genes(data, igblast_db, germline, org, loci, extended, filename_prefix, flavour, min_j_match, min_d_match, v_evalue, d_evalue, j_evalue, reassign_dj, overwrite, dust, verbose)
1032 assigngenes_igblast(
1033 filePath,
...
4310 )
4311 else:
4312 env["IGDATA"] = igblast_db
KeyError: 'Environmental variable IGDATA must be set. Otherwise, please provide path to igblast database'
I really appriciate your helps
you will need to set up your database locations:
you can refer to this to see how to do it:
https://sc-dandelion.readthedocs.io/en/latest/notebooks/1_dandelion_preprocessing-10x_data.html
alternatively, when you run ddl.pp.reannotate_genes
you have to specify it directly : ddl.pp.reannotate_genes(samples, igblast_db = "database/igblast/", germline = "database/germlines/imgt/human/vdj/")
or, just use the singularity container: https://sc-dandelion.readthedocs.io/en/latest/notebooks/Q1-singularity-preprocessing.html
@zktuong
Thank you.